def test_mapr_db_destination(sdc_builder, sdc_executor, cluster):
    """Write a handful of records to the MapR-DB destination and confirm their presence
    with an HBase client.

    dev_raw_data_source >> mapr_db
    """
    # Generate some data. Raw JSON objects are concatenated back to back; the
    # Dev Raw Data Source JSON parser reads them as a stream of objects.
    bike_brands = [dict(name='Bianchi'),
                   dict(name='BMC'),
                   dict(name='Cannondale'),
                   dict(name='Specialized')]
    raw_data = ''.join(json.dumps(brand) for brand in bike_brands)
    table_name = '/user/sdc/{}'.format(get_random_string(string.ascii_letters, 10))

    pipeline_builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='JSON', raw_data=raw_data)
    mapr_db = pipeline_builder.add_stage('MapR DB', type='destination')
    # Each record's /name becomes both the row key and the cf1:cq1 cell value.
    mapr_db.set_attributes(table_name=table_name,
                           row_key='/name',
                           fields=[dict(columnValue='/name',
                                        columnStorageType='TEXT',
                                        columnName='cf1:cq1')])
    dev_raw_data_source >> mapr_db
    pipeline = pipeline_builder.build().configure_for_environment(cluster)

    try:
        logger.info('Creating MapR-DB table %s ...', table_name)
        cluster.execute_command('table', 'create', http_request_method='POST',
                                data={'path': table_name,
                                      'defaultreadperm': 'p',
                                      'defaultwriteperm': 'p'})
        cluster.execute_command('table', 'cf', 'create', http_request_method='POST',
                                data={'path': table_name, 'cfname': 'cf1'})

        sdc_executor.add_pipeline(pipeline)
        sdc_executor.start_pipeline(pipeline).wait_for_pipeline_batch_count(len(bike_brands))

        table = cluster.mapr_db.client.table(name=table_name)
        # Due to the following bug in MapR 6.0.1 MEP 5.0, MapR DB table.scan() call hangs and times out.
        # https://mapr.com/support/s/article/Hung-issue-when-using-HappyBase-python-to-SCAN-MapRDB?language=ja%29
        # Hence read the database table by using table.row() call instead of whole table scan.
        result = [(bike_brand['name'].encode(), table.row(bike_brand['name'].encode()))
                  for bike_brand in bike_brands]

        # Bike brands are stored in a list of dicts ('name' => brand). Manipulate this to match what
        # we expect our MapR-DB rows to look like. Because rows are fetched one by one with
        # table.row(), both sides follow bike_brands order — no sorting is needed here.
        assert [(bike_brand['name'].encode(), {b'cf1:cq1': bike_brand['name'].encode()})
                for bike_brand in bike_brands] == result
    finally:
        # Stop the pipeline (only if it actually started) BEFORE deleting its target table:
        # the original order deleted the table first, racing in-flight writes against the
        # delete, and an unconditional stop_pipeline raised when setup failed before the
        # pipeline ever started, masking the real error. The status guard matches the
        # pattern used by test_mapr_db_cdc_origin_preview in this module.
        if sdc_executor.get_pipeline_status(pipeline).response.json().get('status') == 'RUNNING':
            sdc_executor.stop_pipeline(pipeline)
        logger.info('Deleting MapR-DB table %s ...', table_name)
        cluster.execute_command('table', 'delete', http_request_method='POST',
                                data={'path': table_name})
def test_mapr_json_db_cdc_origin(sdc_builder, sdc_executor, cluster):
    """Insert, update, delete a handful of records in the MapR-DB json table using a
    pipeline. After that create another pipeline with CDC Consumer and verify with
    snapshot that MapR DB CDC consumer gets the correct data.

    dev_raw_data_source >> expression evaluator >> field_remover >> mapr_db_json
    mapr_db_cdc_consumer >> trash
    """
    # CDC (changelog streams) requires MapR 6+ and MEP 5+; skip otherwise.
    if not cluster.version[len('mapr'):].startswith('6'):
        pytest.skip(
            'MapR CDC test only runs against cluster with MapR version 6.')
    if cluster.mep_version == "4.0":
        pytest.skip('MapR CDC test are written only for MEP 5 and above.')

    table_name = get_random_string(string.ascii_letters, 10)
    topic_name = f'{table_name}-topic'
    table_path = f'/user/sdc/{table_name}'
    stream_name = f'/{get_random_string(string.ascii_letters, 10)}'

    # Generate some data. The 'operation' field drives the CRUD header set by the
    # Expression Evaluator below; it is removed before the record reaches the table.
    test_data = [
        dict(_id='1', name='Sachin Tendulkar', operation='insert',
             average=53.79, is_alive=True, runs_bf=1592129437, innings=329),
        dict(_id='2', name='Don Bradman', operation='insert',
             average=53.79, is_alive=False, runs_bf=69969798, innings=80),
        dict(_id='3', name='Gary Sobers', operation='insert',
             average=57.78, is_alive=True, runs_bf=80323867, innings=160),
        dict(_id='1', name='Sachin', operation='update'),
        dict(_id='2', name='Don', operation='update'),
        dict(_id='3', operation='delete')
    ]
    # Concatenated JSON objects; the Dev Raw Data Source parses them as a stream.
    raw_data = ''.join(json.dumps(record) for record in test_data)

    # Expected final data, field remover stage will have the operation field removed
    # (row _id='3' is deleted, rows '1' and '2' reflect their updates).
    final_data = [
        dict(_id='1', name='Sachin', average=53.79, is_alive=True,
             runs_bf=1592129437, innings=329),
        dict(_id='2', name='Don', average=53.79, is_alive=False,
             runs_bf=69969798, innings=80)
    ]

    # Build the MapR JSON DB pipeline.
    pipeline_builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = pipeline_builder.add_stage(
        'Dev Raw Data Source').set_attributes(data_format='JSON',
                                              stop_after_first_batch=True,
                                              raw_data=raw_data)
    expression_evaluator = pipeline_builder.add_stage('Expression Evaluator')
    # Map /operation to the sdc.operation.type record header:
    # insert -> 1, update -> 3, delete -> 2, anything else defaults to 1.
    header_attribute_expressions = (
        "${record:value('/operation')=='insert'?1:"
        "record:value('/operation')=='update'?3:"
        "record:value('/operation')=='delete'?2:1}")
    expression_evaluator.set_attributes(header_attribute_expressions=[{
        'attributeToSet': 'sdc.operation.type',
        'headerAttributeExpression': header_attribute_expressions
    }])
    field_remover = pipeline_builder.add_stage('Field Remover')
    field_remover.set_attributes(fields=['/operation'])
    mapr_db_json_destination = pipeline_builder.add_stage('MapR DB JSON', type='destination')
    mapr_db_json_destination.set_attributes(table_name=table_path, row_key='/_id')
    dev_raw_data_source >> expression_evaluator >> field_remover >> mapr_db_json_destination
    json_db_destination_pipeline = pipeline_builder.build(
        'MapR Json DB Destination').configure_for_environment(cluster)

    # Build the MapR DB CDC Consumer pipeline.
    pipeline_builder = sdc_builder.get_pipeline_builder()
    mapr_db_cdc_consumer = pipeline_builder.add_stage('MapR DB CDC Consumer', type='origin')
    # 'earliest' so the consumer replays all change events written before it started.
    mapr_db_cdc_consumer.set_attributes(
        mapr_streams_configuration=[
            dict(key='auto.offset.reset', value='earliest')
        ],
        number_of_threads=1,
        topic_list=[
            dict(key=f'{stream_name}:{topic_name}', value=f'{table_path}')
        ])
    trash = pipeline_builder.add_stage('Trash')
    mapr_db_cdc_consumer >> trash
    cdc_pipeline = pipeline_builder.build(
        'MapR DB CDC Consumer').configure_for_environment(cluster)

    # Build the MapR DB JSON Consumer pipeline.
    pipeline_builder = sdc_builder.get_pipeline_builder()
    mapr_db_json_origin = pipeline_builder.add_stage('MapR DB JSON Origin')
    mapr_db_json_origin.set_attributes(table_name=table_path)
    trash = pipeline_builder.add_stage('Trash')
    mapr_db_json_origin >> trash
    json_db_origin_pipeline = pipeline_builder.build(
        'MapR Json DB Origin').configure_for_environment(cluster)

    try:
        # Order matters: table, then changelog stream, then wire the changelog to
        # the table before any writes happen, so all changes are captured.
        logger.info('Creating MapR-DB table %s ...', table_path)
        cluster.execute_command('table', 'create', http_request_method='POST',
                                data={
                                    'path': table_path,
                                    'defaultreadperm': 'p',
                                    'tabletype': 'json',
                                    'defaultwriteperm': 'p'
                                })
        logger.info('Creating MapR stream %s ...', stream_name)
        cluster.execute_command('stream', 'create', http_request_method='POST',
                                data={
                                    'path': stream_name,
                                    'ischangelog': 'true',
                                    'consumeperm': 'p',
                                    'defaultpartitions': 1
                                })
        changelog = f'{stream_name}:{topic_name}'
        logger.info('Creating MapR-DB table changelog %s ...', changelog)
        cluster.execute_command('table', 'changelog', 'add', http_request_method='POST',
                                data={
                                    'path': table_path,
                                    'changelog': changelog
                                })

        sdc_executor.add_pipeline(json_db_destination_pipeline, cdc_pipeline,
                                  json_db_origin_pipeline)
        sdc_executor.start_pipeline(json_db_destination_pipeline)
        # Start both consumers without blocking so they read concurrently while the
        # destination pipeline writes.
        cdc_pipeline_command = sdc_executor.capture_snapshot(
            cdc_pipeline, start_pipeline=True, wait=False)
        json_origin_pipeline_command = sdc_executor.capture_snapshot(
            json_db_origin_pipeline, start_pipeline=True, wait=False)

        # Verify with a snapshot.
        cdc_snapshot = cdc_pipeline_command.wait_for_finished(
            timeout_sec=120).snapshot
        json_snapshot = json_origin_pipeline_command.wait_for_finished(
            timeout_sec=120).snapshot
        sdc_executor.stop_pipeline(cdc_pipeline)
        sdc_executor.stop_pipeline(json_db_origin_pipeline)

        actual_cdc = [
            record.field for record in cdc_snapshot[mapr_db_cdc_consumer].output
        ]
        # In the pipeline, Field Remover stage removed field 'operation' and so it will
        # not be present in actual. Remove it from test_data, for verification with
        # assert. (Note: this mutates test_data in place.)
        for record in test_data:
            record.pop('operation')

        actual_json = [
            record.field for record in json_snapshot[mapr_db_json_origin].output
        ]

        # CDC stream must carry every change event; the table itself must hold only
        # the post-update/post-delete state.
        assert actual_cdc == test_data
        assert actual_json == final_data
    finally:
        # Tear down in reverse order of creation: changelog link, stream, table.
        logger.info('Deleting MapR-DB table changelog %s ...',
                    f'{stream_name}:{topic_name}')
        cluster.execute_command('table', 'changelog', 'remove',
                                http_request_method='POST',
                                data={
                                    'path': table_path,
                                    'changelog': f'{stream_name}:{topic_name}'
                                })
        logger.info('Deleting MapR stream %s ...', stream_name)
        cluster.execute_command('stream', 'delete', http_request_method='POST',
                                data={'path': stream_name})
        logger.info('Deleting MapR-DB table %s ...', table_path)
        cluster.execute_command('table', 'delete', http_request_method='POST',
                                data={'path': table_path})
def test_mapr_db_cdc_origin_preview(sdc_builder, sdc_executor, cluster, input_records):
    """We had an issue in which preview pipeline committed records read from streams,
    which made actual runs not read those records. This test will preview the pipeline
    and then assert we have the expected number of records.

    dev_data_generator >> expression evaluator >> field_remover >> mapr_db_json
    mapr_db_cdc_consumer >> wiretap
    """
    # CDC requires MapR 6+ and MEP 5+; skip otherwise.
    if not cluster.version[len('mapr'):].startswith('6'):
        pytest.skip('MapR CDC test only runs against cluster with MapR version 6.')
    if cluster.mep_version == "4.0":
        pytest.skip('MapR CDC test are written only for MEP 5 and above.')

    table_name = get_random_string(string.ascii_letters, 10)
    topic_name = f'{table_name}-topic'
    table_path = f'/user/sdc/{table_name}'
    stream_name = f'/{get_random_string(string.ascii_letters, 10)}'

    # Build the MapR JSON DB pipeline.
    pipeline_builder = sdc_builder.get_pipeline_builder()
    dev_data_generator = pipeline_builder.add_stage('Dev Data Generator')
    # input_records is a fixture/parameter controlling how many change events are
    # produced (and therefore expected downstream).
    dev_data_generator.set_attributes(records_to_be_generated=input_records)
    dev_data_generator.fields_to_generate = [
        {'field': '_id', 'type': 'STRING'},
        {'field': 'name', 'type': 'STRING'},
        {'field': 'address', 'type': 'STRING'},
        {'field': 'mail', 'type': 'STRING'},
    ]
    expression_evaluator = pipeline_builder.add_stage('Expression Evaluator')
    # Map /operation to sdc.operation.type: insert -> 1, update -> 3, delete -> 2,
    # default 1. Generated records have no /operation field, so all are inserts.
    header_attribute_expressions = ("${record:value('/operation')=='insert'?1:"
                                    "record:value('/operation')=='update'?3:"
                                    "record:value('/operation')=='delete'?2:1}")
    expression_evaluator.set_attributes(header_attribute_expressions=[
        {'attributeToSet': 'sdc.operation.type',
         'headerAttributeExpression': header_attribute_expressions}
    ])
    field_remover = pipeline_builder.add_stage('Field Remover')
    field_remover.set_attributes(fields=['/operation'])
    mapr_db_json_destination = pipeline_builder.add_stage('MapR DB JSON', type='destination')
    mapr_db_json_destination.set_attributes(table_name=table_path, row_key='/_id')
    dev_data_generator >> expression_evaluator >> field_remover >> mapr_db_json_destination
    json_db_destination_pipeline = pipeline_builder.build('MapR Json DB Destination').configure_for_environment(cluster)

    # Build the MapR DB CDC Consumer pipeline.
    pipeline_builder = sdc_builder.get_pipeline_builder()
    mapr_db_cdc_consumer = pipeline_builder.add_stage('MapR DB CDC Consumer', type='origin')
    # 'earliest' so both the preview and the real run start from the beginning of
    # the changelog stream — the crux of the regression being tested.
    mapr_db_cdc_consumer.set_attributes(mapr_streams_configuration=[dict(key='auto.offset.reset', value='earliest')],
                                        number_of_threads=1,
                                        topic_list=[dict(key=f'{stream_name}:{topic_name}', value=f'{table_path}')])
    wiretap_cdc = pipeline_builder.add_wiretap()
    mapr_db_cdc_consumer >> wiretap_cdc.destination
    cdc_pipeline = pipeline_builder.build('MapR DB CDC Consumer').configure_for_environment(cluster)

    try:
        # Create table, changelog stream, and link them before any writes occur.
        logger.info('Creating MapR-DB table %s ...', table_path)
        cluster.execute_command('table', 'create', http_request_method='POST',
                                data={'path': table_path,
                                      'defaultreadperm': 'p',
                                      'tabletype': 'json',
                                      'defaultwriteperm': 'p'})
        logger.info('Creating MapR stream %s ...', stream_name)
        cluster.execute_command('stream', 'create', http_request_method='POST',
                                data={'path': stream_name,
                                      'ischangelog': 'true',
                                      'consumeperm': 'p',
                                      'defaultpartitions': 1})
        changelog = f'{stream_name}:{topic_name}'
        logger.info('Creating MapR-DB table changelog %s ...', changelog)
        cluster.execute_command('table', 'changelog', 'add', http_request_method='POST',
                                data={'path': table_path, 'changelog': changelog})

        sdc_executor.add_pipeline(json_db_destination_pipeline, cdc_pipeline)
        # Write all input_records to the table (and hence to the changelog) first.
        sdc_executor.start_pipeline(json_db_destination_pipeline).wait_for_finished()

        # Preview must read records WITHOUT committing stream offsets.
        preview = sdc_executor.run_pipeline_preview(cdc_pipeline, timeout=30000).preview
        assert preview is not None
        assert preview.issues.issues_count == 0
        # We first assert preview has the default 10 records
        assert len(preview[mapr_db_cdc_consumer].output) == 10

        sdc_executor.start_pipeline(cdc_pipeline)
        sdc_executor.wait_for_pipeline_metric(cdc_pipeline, 'input_record_count', input_records,
                                              timeout_sec=90)

        actual_cdc = [record.field for record in wiretap_cdc.output_records]
        # And second that we actually consumed the input_records records, and not 10 less
        # (10 less would mean preview committed the offsets it read).
        assert len(actual_cdc) == input_records
    finally:
        # Guard the stop: the CDC pipeline may never have started if setup failed.
        if sdc_executor.get_pipeline_status(cdc_pipeline).response.json().get('status') == 'RUNNING':
            sdc_executor.stop_pipeline(cdc_pipeline)
        # Tear down in reverse order of creation: changelog link, stream, table.
        logger.info('Deleting MapR-DB table changelog %s ...', f'{stream_name}:{topic_name}')
        cluster.execute_command('table', 'changelog', 'remove', http_request_method='POST',
                                data={'path': table_path, 'changelog': f'{stream_name}:{topic_name}'})
        logger.info('Deleting MapR stream %s ...', stream_name)
        cluster.execute_command('stream', 'delete', http_request_method='POST',
                                data={'path': stream_name})
        logger.info('Deleting MapR-DB table %s ...', table_path)
        cluster.execute_command('table', 'delete', http_request_method='POST',
                                data={'path': table_path})
def test_mapr_db_destination(sdc_builder, sdc_executor, cluster):
    """Write a handful of records to the MapR-DB destination and confirm their presence
    with an HBase client.

    dev_raw_data_source >> mapr_db
    """
    # NOTE(review): this module defines test_mapr_db_destination twice (a table.row()
    # variant appears earlier); pytest collects only one definition per name — confirm
    # which variant is intended and rename one if both should run.
    # Generate some data.
    bike_brands = [
        dict(name='Cannondale'),
        dict(name='Specialized'),
        dict(name='Bianchi'),
        dict(name='BMC')
    ]
    raw_data = ''.join(json.dumps(brand) for brand in bike_brands)
    table_name = '/user/sdc/{}'.format(
        get_random_string(string.ascii_letters, 10))

    pipeline_builder = sdc_builder.get_pipeline_builder()
    dev_raw_data_source = pipeline_builder.add_stage('Dev Raw Data Source')
    dev_raw_data_source.set_attributes(data_format='JSON', raw_data=raw_data)
    mapr_db = pipeline_builder.add_stage('MapR DB', type='destination')
    # Each record's /name becomes both the row key and the cf1:cq1 cell value.
    mapr_db.set_attributes(table_name=table_name,
                           row_key='/name',
                           fields=[
                               dict(columnValue='/name',
                                    columnStorageType='TEXT',
                                    columnName='cf1:cq1')
                           ])
    dev_raw_data_source >> mapr_db
    pipeline = pipeline_builder.build().configure_for_environment(cluster)

    try:
        logger.info('Creating MapR-DB table %s ...', table_name)
        cluster.execute_command('table', 'create', http_request_method='POST',
                                data={
                                    'path': table_name,
                                    'defaultreadperm': 'p',
                                    'defaultwriteperm': 'p'
                                })
        cluster.execute_command('table', 'cf', 'create', http_request_method='POST',
                                data={
                                    'path': table_name,
                                    'cfname': 'cf1'
                                })

        sdc_executor.add_pipeline(pipeline)
        sdc_executor.start_pipeline(pipeline).wait_for_pipeline_batch_count(
            len(bike_brands))

        rows = [(key, value) for key, value in cluster.mapr_db.client.table(
            name=table_name).scan()]

        # Bike brands are stored in a list of dicts ('name' => brand). Manipulate this
        # to match what we expect our MapR-DB rows to look like (including putting them
        # in lexicographic order, since scan() yields rows in key order).
        assert sorted((bike_brand['name'].encode(), {
            b'cf1:cq1': bike_brand['name'].encode()
        }) for bike_brand in bike_brands) == rows
    finally:
        # Stop the pipeline (only if it actually started) BEFORE deleting its target
        # table: the original order deleted the table first, racing in-flight writes
        # against the delete, and an unconditional stop_pipeline raised when setup
        # failed before the pipeline ever started, masking the real error. The status
        # guard matches the pattern used by test_mapr_db_cdc_origin_preview above.
        if sdc_executor.get_pipeline_status(pipeline).response.json().get('status') == 'RUNNING':
            sdc_executor.stop_pipeline(pipeline)
        logger.info('Deleting MapR-DB table %s ...', table_name)
        cluster.execute_command('table', 'delete', http_request_method='POST',
                                data={'path': table_name})