Example No. 1
def test_postgres_cdc_max_poll_attempts(sdc_builder, sdc_executor, database):
    """Test the delivery of a batch when the maximum poll attempts is reached.

    The condition to generate a new batch in the PostgreSQL CDC Origin is either a) reaching the maximum batch
    size or b) reaching the maximum number of attempts to poll data from CDC. This test sets a max batch size of
    100 records and checks that a new batch is generated with only a few records because the max poll attempts
    limit is hit.

    Pipeline:
        postgres_cdc_client >> trash

    """
    if not database.is_cdc_enabled:
        pytest.skip('Test only runs against PostgreSQL with CDC enabled.')

    table_name = get_random_string(string.ascii_lowercase, 20)

    pipeline_builder = sdc_builder.get_pipeline_builder()
    postgres_cdc_client = pipeline_builder.add_stage('PostgreSQL CDC Client')
    replication_slot_name = get_random_string(string.ascii_lowercase, 10)
    postgres_cdc_client.set_attributes(remove_replication_slot_on_close=True,
                                       max_batch_size_in_records=100,
                                       poll_interval=POLL_INTERVAL,
                                       replication_slot=replication_slot_name)
    trash = pipeline_builder.add_stage('Trash')
    postgres_cdc_client >> trash

    pipeline = pipeline_builder.build().configure_for_environment(database)
    sdc_executor.add_pipeline(pipeline)

    try:
        # Database operations done after pipeline start will be captured by CDC.
        # Hence start the pipeline but do not wait for the capture to be finished.
        snapshot_command = sdc_executor.capture_snapshot(pipeline, start_pipeline=True, wait=False)

        # Create table, perform a few insertions, and then wait long enough for the pipeline to hit the
        # max poll attempts (max poll attempts == POLL_INTERVAL * 100).
        table = _create_table_in_database(table_name, database)
        connection = database.engine.connect()
        expected_operations_data = _insert(connection=connection, table=table)
        snapshot = snapshot_command.wait_for_finished(120).snapshot

        # Verify that the snapshot data is received in the exact order expected.
        for record in snapshot[postgres_cdc_client.instance_name].output:
            # No need to worry about DDL-related CDC records (e.g. table creation).
            if record.get_field_data('/change'):
                # Check that the CDC record's change field contains a list of 3 insertions.
                for i in range(len(INSERT_ROWS)):
                    expected = expected_operations_data[i]
                    assert expected.kind == record.get_field_data(f'/change[{i}]/kind')
                    assert expected.table == record.get_field_data(f'/change[{i}]/table')
                    assert expected.columnnames == record.get_field_data(f'/change[{i}]/columnnames')
                    assert expected.columnvalues == record.get_field_data(f'/change[{i}]/columnvalues')

    finally:
        if pipeline:
            sdc_executor.stop_pipeline(pipeline=pipeline, force=True)
        database.deactivate_and_drop_replication_slot(replication_slot_name)
        if table is not None:
            table.drop(database.engine)
            logger.info('Table: %s dropped.', table_name)
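
The module-level constants and helpers used above (POLL_INTERVAL, INSERT_ROWS, _create_table_in_database, _insert) are not part of this excerpt. A minimal sketch of what they might look like, assuming SQLAlchemy and wal2json-style change data; all names, values and shapes here are assumptions rather than the actual test-suite code:

from collections import namedtuple

import sqlalchemy

# Assumed constants; the real module may use different values.
POLL_INTERVAL = 1          # seconds between CDC polls
PRIMARY_KEY = 'id'
NAME_COLUMN = 'name'
INSERT_ROWS = [{PRIMARY_KEY: i, NAME_COLUMN: f'name_{i}'} for i in range(3)]
KIND_FOR_INSERT = 'insert'

# Shape of one expected wal2json change entry checked by the assertions above.
ExpectedChange = namedtuple('ExpectedChange',
                            ['kind', 'table', 'columnnames', 'columnvalues', 'oldkeys'],
                            defaults=[None])


def _create_table_in_database(table_name, database):
    """Create a simple (id, name) table and return the SQLAlchemy Table object.

    The real helper presumably also creates the date/time columns used by test_parse_datetimes below.
    """
    table = sqlalchemy.Table(table_name,
                             sqlalchemy.MetaData(),
                             sqlalchemy.Column(PRIMARY_KEY, sqlalchemy.Integer, primary_key=True),
                             sqlalchemy.Column(NAME_COLUMN, sqlalchemy.String(32)))
    table.create(database.engine)
    return table


def _insert(connection, table, insert_rows=INSERT_ROWS, create_txn=False):
    """Insert rows (optionally inside one explicit transaction) and return the expected CDC data."""
    txn = connection.begin() if create_txn else None
    for row in insert_rows:
        connection.execute(table.insert(), row)
    if txn is not None:
        txn.commit()
    return [ExpectedChange(kind=KIND_FOR_INSERT,
                           table=table.name,
                           columnnames=list(row.keys()),
                           columnvalues=list(row.values()))
            for row in insert_rows]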
Example No. 2
def test_postgres_cdc_client_multiple_concurrent_insertions(
        sdc_builder, sdc_executor, database):
    """Basic test that inserts to a Postgres table with multiple threads,
    and validates via timeout checking that all CDCs are received,
    and not filtered, by SDC
    Here `Initial Change` config. is at default value = `From the latest change`.
    With this, the origin processes all changes that occur after pipeline is started.

    The pipeline looks like:
        postgres_cdc_client >> trash
    """
    if not database.is_cdc_enabled:
        pytest.skip('Test only runs against PostgreSQL with CDC enabled.')

    table_name = get_random_string(string.ascii_lowercase, 20)

    pipeline_builder = sdc_builder.get_pipeline_builder()
    postgres_cdc_client = pipeline_builder.add_stage('PostgreSQL CDC Client')
    replication_slot_name = get_random_string(string.ascii_lowercase, 10)
    postgres_cdc_client.set_attributes(remove_replication_slot_on_close=True,
                                       max_batch_size_in_records=1,
                                       poll_interval=POLL_INTERVAL,
                                       replication_slot=replication_slot_name)
    trash = pipeline_builder.add_stage('Trash')
    postgres_cdc_client >> trash

    pipeline = pipeline_builder.build().configure_for_environment(database)
    sdc_executor.add_pipeline(pipeline)

    try:
        start_command = sdc_executor.start_pipeline(pipeline)
        # Create table and then perform insert operations from several threads concurrently.
        table = _create_table_in_database(table_name, database)
        connections = [database.engine.connect() for _ in range(NUM_THREADS)]

        def inserter_thread(connection, table, id, amount):
            for i in range(amount):
                insert_rows = [{
                    PRIMARY_KEY: id * amount + i,
                    NAME_COLUMN: get_random_string(string.ascii_lowercase, 10)
                }]
                _insert(connection=connection,
                        table=table,
                        insert_rows=insert_rows,
                        create_txn=True)

        thread_pool = [
            threading.Thread(target=inserter_thread,
                             args=(connections[i], table, i,
                                   INSERTS_PER_THREAD))
            for i in range(NUM_THREADS)
        ]

        for thread in thread_pool:
            thread.start()

        for thread in thread_pool:
            thread.join()

        start_command.wait_for_pipeline_batch_count(TOTAL_THREADING_RECORDS)

    finally:
        if pipeline:
            sdc_executor.stop_pipeline(pipeline=pipeline, force=True)
        database.deactivate_and_drop_replication_slot(replication_slot_name)
        if table is not None:
            table.drop(database.engine)
            logger.info('Table: %s dropped.', table_name)
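
NUM_THREADS, INSERTS_PER_THREAD and TOTAL_THREADING_RECORDS are also defined outside this excerpt. Plausible definitions, assuming one single-row transaction per insert (values are illustrative only):

# Assumed concurrency constants for the threaded tests; actual values may differ.
NUM_THREADS = 5
INSERTS_PER_THREAD = 10
# With max_batch_size_in_records=1 and one single-row transaction per insert,
# every inserted row produces its own batch, hence the batch-count wait above.
TOTAL_THREADING_RECORDS = NUM_THREADS * INSERTS_PER_THREAD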
Example No. 3
def test_postgres_cdc_client_filtering_table(sdc_builder, sdc_executor,
                                             database):
    """
        Test filtering for inserts/updates/deletes to a Postgres table

        1. Random table names for "table_allow", "table_deny"
        2. Filter OUT anything for "table_deny"
        3. Insert/update/delete for both tables
        4. Should see updates for "table_allow" only

        The pipeline looks like:
        postgres_cdc_client >> trash
    """
    if not database.is_cdc_enabled:
        pytest.skip('Test only runs against PostgreSQL with CDC enabled.')

    table_name_allow = get_random_string(string.ascii_lowercase, 20)
    table_name_deny = get_random_string(string.ascii_lowercase, 20)

    pipeline_builder = sdc_builder.get_pipeline_builder()
    postgres_cdc_client = pipeline_builder.add_stage('PostgreSQL CDC Client')
    replication_slot_name = get_random_string(string.ascii_lowercase, 10)

    postgres_cdc_client.set_attributes(remove_replication_slot_on_close=True,
                                       replication_slot=replication_slot_name,
                                       max_batch_size_in_records=1,
                                       poll_interval=POLL_INTERVAL,
                                       tables=[{
                                           'schema': 'public',
                                           'excludePattern': table_name_deny,
                                           'table': table_name_allow
                                       }])
    trash = pipeline_builder.add_stage('Trash')
    postgres_cdc_client >> trash

    pipeline = pipeline_builder.build().configure_for_environment(database)
    sdc_executor.add_pipeline(pipeline)

    try:
        # Database operations done after pipeline start will be captured by CDC.
        # Hence start the pipeline but do not wait for the capture to be finished.
        snapshot_command = sdc_executor.capture_snapshot(pipeline,
                                                         start_pipeline=True,
                                                         wait=False)

        # Create table and then perform insert, update and delete operations.
        table_allow = _create_table_in_database(table_name_allow, database)
        table_deny = _create_table_in_database(table_name_deny, database)
        connection = database.engine.connect()

        expected_operations_data = _insert(connection=connection,
                                           table=table_allow)
        expected_operations_data += _update(connection=connection,
                                            table=table_allow)
        expected_operations_data += _delete(connection=connection,
                                            table=table_allow)

        # Operations on the denied table should be filtered out by the origin, so only the allowed table's
        # operations form the expected data; the denied table's operations are performed only to exercise the filter.
        actual_operations_data = expected_operations_data.copy()

        actual_operations_data += _insert(connection=connection,
                                          table=table_deny)
        actual_operations_data += _update(connection=connection,
                                          table=table_deny)
        actual_operations_data += _delete(connection=connection,
                                          table=table_deny)

        snapshot = snapshot_command.wait_for_finished().snapshot

        # Verify that the snapshot data is received in the exact order expected.
        operation_index = 0

        for record in snapshot[postgres_cdc_client.instance_name].output:
            # No need to worry about DDL-related CDC records (e.g. table creation).
            if record.get_field_data('/change'):
                # Since we performed each operation (insert, update and delete) on 3 rows,
                # each CDC record change contains a list of 3 elements.
                for i in range(3):
                    if operation_index >= len(expected_operations_data):
                        break
                    expected = expected_operations_data[operation_index]
                    assert expected.kind == record.get_field_data(
                        f'/change[{i}]/kind')
                    assert expected.table == record.get_field_data(
                        f'/change[{i}]/table')
                    # For delete operation there are no columnnames and columnvalues fields.
                    if expected.kind != KIND_FOR_DELETE:
                        assert expected.columnnames == record.get_field_data(
                            f'/change[{i}]/columnnames')
                        assert expected.columnvalues == record.get_field_data(
                            f'/change[{i}]/columnvalues')
                    if expected.kind != KIND_FOR_INSERT:
                        # For update and delete operations verify extra information about old keys.
                        assert expected.oldkeys.keynames == record.get_field_data(
                            f'/change[{i}]/oldkeys/keynames')
                        assert expected.oldkeys.keyvalues == record.get_field_data(
                            f'/change[{i}]/oldkeys/keyvalues')
                    operation_index += 1

    finally:
        if pipeline:
            sdc_executor.stop_pipeline(pipeline=pipeline, force=True)
        database.deactivate_and_drop_replication_slot(replication_slot_name)
        if table_allow is not None:
            table_allow.drop(database.engine)
            logger.info('Table: %s dropped.', table_name_allow)
        if table_deny is not None:
            table_deny.drop(database.engine)
            logger.info('Table: %s dropped.', table_name_deny)
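
_update and _delete follow the same pattern as _insert, additionally recording the old primary-key values that wal2json reports under oldkeys for non-insert operations. A minimal sketch under the same assumptions as the sketch after example 1 (names and shapes are assumptions, not the actual helpers):

from collections import namedtuple

# Same assumed constants as in the sketch after example 1.
PRIMARY_KEY = 'id'
NAME_COLUMN = 'name'
INSERT_ROWS = [{PRIMARY_KEY: i, NAME_COLUMN: f'name_{i}'} for i in range(3)]
KIND_FOR_UPDATE = 'update'
KIND_FOR_DELETE = 'delete'

OldKeys = namedtuple('OldKeys', ['keynames', 'keyvalues'])
ExpectedChange = namedtuple('ExpectedChange',
                            ['kind', 'table', 'columnnames', 'columnvalues', 'oldkeys'],
                            defaults=[None])


def _update(connection, table, update_rows=INSERT_ROWS):
    """Update the name column of each row and return the expected CDC data."""
    expected = []
    for row in update_rows:
        connection.execute(table.update()
                           .where(table.c[PRIMARY_KEY] == row[PRIMARY_KEY])
                           .values({NAME_COLUMN: row[NAME_COLUMN]}))
        expected.append(ExpectedChange(kind=KIND_FOR_UPDATE,
                                       table=table.name,
                                       columnnames=[PRIMARY_KEY, NAME_COLUMN],
                                       columnvalues=[row[PRIMARY_KEY], row[NAME_COLUMN]],
                                       oldkeys=OldKeys(keynames=[PRIMARY_KEY],
                                                       keyvalues=[row[PRIMARY_KEY]])))
    return expected


def _delete(connection, table, delete_rows=INSERT_ROWS):
    """Delete each row and return the expected CDC data (deletes carry no column data)."""
    expected = []
    for row in delete_rows:
        connection.execute(table.delete().where(table.c[PRIMARY_KEY] == row[PRIMARY_KEY]))
        expected.append(ExpectedChange(kind=KIND_FOR_DELETE,
                                       table=table.name,
                                       columnnames=None,
                                       columnvalues=None,
                                       oldkeys=OldKeys(keynames=[PRIMARY_KEY],
                                                       keyvalues=[row[PRIMARY_KEY]])))
    return expected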
Example No. 4
def test_postgres_cdc_client_basic(sdc_builder, sdc_executor, database):
    """Basic test that inserts/updates/deletes to a Postgres table,
    and validates that they are read in the same order.
    Here `Initial Change` config. is at default value = `From the latest change`.
    With this, the origin processes all changes that occur after pipeline is started.

    The pipeline looks like:
        postgres_cdc_client >> trash
    """
    if not database.is_cdc_enabled:
        pytest.skip('Test only runs against PostgreSQL with CDC enabled.')

    table_name = get_random_string(string.ascii_lowercase, 20)

    pipeline_builder = sdc_builder.get_pipeline_builder()
    postgres_cdc_client = pipeline_builder.add_stage('PostgreSQL CDC Client')
    replication_slot_name = get_random_string(string.ascii_lowercase, 10)
    postgres_cdc_client.set_attributes(remove_replication_slot_on_close=True,
                                       max_batch_size_in_records=1,
                                       poll_interval=POLL_INTERVAL,
                                       replication_slot=replication_slot_name)
    trash = pipeline_builder.add_stage('Trash')
    postgres_cdc_client >> trash

    pipeline = pipeline_builder.build().configure_for_environment(database)
    sdc_executor.add_pipeline(pipeline)

    try:
        # Database operations done after pipeline start will be captured by CDC.
        # Hence start the pipeline but do not wait for the capture to be finished.
        snapshot_command = sdc_executor.capture_snapshot(pipeline,
                                                         start_pipeline=True,
                                                         wait=False)

        # Create table and then perform insert, update and delete operations.
        table = _create_table_in_database(table_name, database)
        connection = database.engine.connect()
        expected_operations_data = _insert(connection=connection, table=table)
        expected_operations_data += _update(connection=connection, table=table)
        expected_operations_data += _delete(connection=connection, table=table)

        snapshot = snapshot_command.wait_for_finished().snapshot

        # Verify that the snapshot data is received in the exact order expected.
        operation_index = 0
        for record in snapshot[postgres_cdc_client.instance_name].output:
            # No need to worry about DDL-related CDC records (e.g. table creation).
            if record.get_field_data('/change'):
                # Since we performed each operation (insert, update and delete) on 3 rows,
                # each CDC record change contains a list of 3 elements.
                for i in range(3):
                    expected = expected_operations_data[operation_index]
                    assert expected.kind == record.get_field_data(
                        f'/change[{i}]/kind')
                    assert expected.table == record.get_field_data(
                        f'/change[{i}]/table')
                    # For delete operation there are no columnnames and columnvalues fields.
                    if expected.kind != KIND_FOR_DELETE:
                        assert expected.columnnames == record.get_field_data(
                            f'/change[{i}]/columnnames')
                        assert expected.columnvalues == record.get_field_data(
                            f'/change[{i}]/columnvalues')
                    if expected.kind != KIND_FOR_INSERT:
                        # For update and delete operations verify extra information about old keys.
                        assert expected.oldkeys.keynames == record.get_field_data(
                            f'/change[{i}]/oldkeys/keynames')
                        assert expected.oldkeys.keyvalues == record.get_field_data(
                            f'/change[{i}]/oldkeys/keyvalues')
                    operation_index += 1

    finally:
        if pipeline:
            sdc_executor.stop_pipeline(pipeline=pipeline, force=True)
        database.deactivate_and_drop_replication_slot(replication_slot_name)
        if table is not None:
            table.drop(database.engine)
            logger.info('Table: %s dropped.', table_name)
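
The field paths asserted above (/change[i]/kind, /change[i]/columnvalues, /change[i]/oldkeys/...) follow the wal2json output format. An illustrative sketch of the root field of one CDC record for a single-row update; all values are invented for illustration:

# Illustrative wal2json-style record value; real values depend on the table,
# the operation and the transaction that produced the change.
cdc_record_value = {
    'xid': 1234,                       # transaction id
    'nextlsn': '0/16B3748',            # LSN following this transaction
    'timestamp': '2020-01-01 00:00:00.000000+00',
    'change': [
        {
            'kind': 'update',
            'schema': 'public',
            'table': 'some_table',
            'columnnames': ['id', 'name'],
            'columnvalues': [1, 'updated name'],
            # oldkeys is present for update/delete changes only, which is why the
            # assertions above skip it for inserts and skip column data for deletes.
            'oldkeys': {'keynames': ['id'], 'keyvalues': [1]},
        },
        # One entry per row changed in the same transaction.
    ],
}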
Example No. 5
def test_postgres_cdc_client_multiple_concurrent_operations(sdc_builder, sdc_executor, database, batch_size):
    """Basic test that inserts/update/delete to a Postgres table with multiple threads,
    and validates using a wire tap the records processed.
    Here `Initial Change` config. is at default value = `From the latest change`.
    With this, the origin processes all changes that occur after pipeline is started.

    The pipeline looks like:
        postgres_cdc_client >> [pipeline_finesher, wiretap]
    """
    if not database.is_cdc_enabled:
        pytest.skip('Test only runs against PostgreSQL with CDC enabled.')

    table_name = get_random_string(string.ascii_lowercase, 20)

    pipeline_builder = sdc_builder.get_pipeline_builder()
    postgres_cdc_client = pipeline_builder.add_stage('PostgreSQL CDC Client')
    replication_slot_name = get_random_string(string.ascii_lowercase, 10)
    postgres_cdc_client.set_attributes(remove_replication_slot_on_close=False,
                                       max_batch_size_in_records=batch_size,
                                       poll_interval=POLL_INTERVAL,
                                       replication_slot=replication_slot_name,
                                       batch_wait_time_in_ms=3000
                                       )
    wiretap = pipeline_builder.add_wiretap()

    pipeline_finisher = pipeline_builder.add_stage('Pipeline Finisher Executor')
    # We want the pipeline to stop automatically once it sees the sentinel record inserted at the end of
    # the test (the row with primary key -1).
    pipeline_finisher.set_attributes(preconditions=[
        "${record:value('/change[0]/columnvalues[0]') == -1}"
    ])

    postgres_cdc_client >> [wiretap.destination, pipeline_finisher]

    pipeline = pipeline_builder.build().configure_for_environment(database)
    sdc_executor.add_pipeline(pipeline)

    try:
        pipeline_cmd = sdc_executor.start_pipeline(pipeline)

        # Create table and then perform insert/update/delete operations from several threads concurrently.
        table = _create_table_in_database(table_name, database)
        connections = [database.engine.connect() for _ in range(NUM_THREADS)]

        expected = []

        def inserter_thread(connection, table, id, amount):
            for i in range(amount):
                insert_rows = [
                    {
                        PRIMARY_KEY: id * amount + i,
                        NAME_COLUMN: get_random_string(string.ascii_lowercase, 10)
                    }
                ]
                expected.append(_insert(
                    connection=connection,
                    table=table,
                    insert_rows=insert_rows,
                    create_txn=True
                ))
                insert_rows = [
                    {
                        PRIMARY_KEY: id * amount + i,
                        NAME_COLUMN: get_random_string(string.ascii_lowercase, 10)
                    }
                ]
                expected.append(_update(
                    connection=connection,
                    table=table,
                    update_rows=insert_rows
                ))
                expected.append(_delete(
                    connection=connection,
                    table=table,
                    delete_rows=insert_rows
                ))

        thread_pool = [
            threading.Thread(
                target=inserter_thread,
                args=(connections[i], table, i, INSERTS_PER_THREAD)
            )
            for i in range(NUM_THREADS)
        ]

        for thread in thread_pool:
            thread.start()

        for thread in thread_pool:
            thread.join()

        final_row = [{PRIMARY_KEY: -1, NAME_COLUMN: 'Last Record'}]
        expected.append(_insert(
            connection=connections[0],
            table=table,
            insert_rows=final_row,
            create_txn=True
        ))
        pipeline_cmd.wait_for_finished(timeout_sec=120)

        output = []
        for record in wiretap.output_records:
            if record.get_field_data('/change[0]/kind') == 'delete':
                output.append({'type': 'delete', 'value': record.get_field_data('/change[0]/oldkeys/keyvalues')})
            if record.get_field_data('/change[0]/kind') == 'insert':
                output.append({'type': 'insert', 'value': record.get_field_data('/change[0]/columnvalues')})
            if record.get_field_data('/change[0]/kind') == 'update':
                output.append({'type': 'update', 'value': record.get_field_data('/change[0]/columnvalues')})

        output_sorted_values = sorted(output, key=lambda key: f'{key["value"][0]}|{key["type"]}')

        expected_values = []
        for record in expected:
            if record[0].kind == 'delete':
                expected_values.append({'type': 'delete', 'value': record[0].oldkeys.keyvalues})
            if record[0].kind == 'insert':
                expected_values.append({'type': 'insert', 'value': record[0].columnvalues})
            if record[0].kind == 'update':
                expected_values.append({'type': 'update', 'value': record[0].columnvalues})

        expected_sorted_values = sorted(expected_values, key=lambda key: f'{key["value"][0]}|{key["type"]}')

        assert len(expected_sorted_values) == len(output_sorted_values)
        assert expected_sorted_values == output_sorted_values

    finally:
        if sdc_executor.get_pipeline_status(pipeline).response.json().get('status') == 'RUNNING':
            sdc_executor.stop_pipeline(pipeline=pipeline, force=True)
        database.deactivate_and_drop_replication_slot(replication_slot_name)
        if table is not None:
            table.drop(database.engine)
            logger.info('Table: %s dropped.', table_name)
Example No. 6
def test_start_not_from_latest(sdc_builder, sdc_executor, database, start_from, create_slot):
    """
    We test that start from LSN and Date works as expected, for that we insert some data, get the date/lsn and insert
    some more data, after that we verify that we only process the second batch inserted.

    After that we insert a third batch of records and start again the pipeline to verify that we don't read any
    duplicated record.

    Apart from that we also included a case where the replication slot is created by the pipeline it set during the
    start, in that case we need to insert new data after it gets created so, there is a fourth batch of records that is
    inserted and processed.
    """

    if not database.is_cdc_enabled:
        pytest.skip('Test only runs against PostgreSQL with CDC enabled.')

    if start_from == 'LSN' and database.database_server_version.major < 10:
        pytest.skip('LSN test cannot be executed in versions < 10.')

    SAMPLE_DATA = [dict(id=f'1{i}', name=f'Alex_{i}') for i in range(20)]
    SAMPLE_DATA_2 = [dict(id=f'2{i}', name=f'Martin_{i}') for i in range(20)]
    SAMPLE_DATA_3 = [dict(id=f'3{i}', name=f'Santhosh_{i}') for i in range(20)]
    SAMPLE_DATA_4 = [dict(id=f'4{i}', name=f'Tucu_{i}') for i in range(20)]

    table_name = get_random_string(string.ascii_lowercase, 20)
    table = sqlalchemy.Table(table_name,
                             sqlalchemy.MetaData(),
                             sqlalchemy.Column('id', sqlalchemy.String(20), primary_key=True),
                             sqlalchemy.Column('name', sqlalchemy.String(20)))
    replication_slot = get_random_string(string.ascii_lowercase, 10)

    try:
        table.create(database.engine)

        if create_slot:
            # create replication slot
            with database.engine.connect().execution_options(autocommit=True) as connection:
                connection.execute(
                    f'SELECT * FROM pg_create_logical_replication_slot(\'{replication_slot}\', \'wal2json\')')

        # insert first batch of data
        with database.engine.connect().execution_options(autocommit=True) as connection:
            for row in SAMPLE_DATA:
                connection.execute(table.insert(), row)

        if start_from == 'DATE':
            # get timestamp from database and timezone
            time.sleep(5)
            with database.engine.connect().execution_options(autocommit=True) as connection:
                date = connection.execute('SELECT CURRENT_TIMESTAMP').first()[0]
                timezone = str(connection.execute('SHOW timezone').first()[0])
        else:
            # get current lsn from replication slot
            with database.engine.connect().execution_options(autocommit=True) as connection:
                start_lsn = str(connection.execute('select pg_current_wal_lsn()').first()[0])

        # insert second batch of data
        with database.engine.connect().execution_options(autocommit=True) as connection:
            for row in SAMPLE_DATA_2:
                connection.execute(table.insert(), row)

        pipeline_builder = sdc_builder.get_pipeline_builder()
        postgresql_cdc_client = pipeline_builder.add_stage('PostgreSQL CDC Client')
        postgresql_cdc_client.set_attributes(replication_slot=replication_slot,
                                             initial_change=start_from,
                                             poll_interval=1)

        if start_from == 'DATE':
            postgresql_cdc_client.set_attributes(start_date=date.strftime('%m-%d-%Y %H:%M:%S'),
                                                 db_time_zone=timezone)
        else:
            postgresql_cdc_client.set_attributes(start_lsn=start_lsn)

        wiretap = pipeline_builder.add_wiretap()
        pipeline_finisher = pipeline_builder.add_stage('Pipeline Finisher Executor')
        pipeline_finisher.set_attributes(preconditions=[
            "${record:value('/change[0]/columnvalues[0]') == 219"
            " or record:value('/change[0]/columnvalues[0]') == 319"
            " or record:value('/change[0]/columnvalues[0]') == 419}"])
        postgresql_cdc_client >> [wiretap.destination, pipeline_finisher]
        pipeline = pipeline_builder.build().configure_for_environment(database)
        sdc_executor.add_pipeline(pipeline)
        sdc_executor.start_pipeline(pipeline)

        if create_slot:
            # We manually created the slot before capturing the date/LSN, so the second batch of data inserted
            # afterwards will be available
            expected_data = SAMPLE_DATA_2
        else:
            # Since the pipeline creates the replication slot itself, we insert data now so that it gets captured
            expected_data = SAMPLE_DATA_4
            sdc_executor.wait_for_pipeline_status(pipeline, 'RUNNING', timeout_sec=120)
            # insert fourth batch of data
            with database.engine.connect().execution_options(autocommit=True) as connection:
                for row in SAMPLE_DATA_4:
                    connection.execute(table.insert(), row)

        # Pipeline will stop once it sees id=219, or id=419 when the pipeline creates the replication slot itself
        sdc_executor.wait_for_pipeline_status(pipeline, 'FINISHED', timeout_sec=120)

        # Since we stop gracefully, we expect to see the whole expected batch (SAMPLE_DATA_2, or SAMPLE_DATA_4
        # when the pipeline creates the replication slot) written to the destination.
        # Within the field, column names are stored in a list (e.g. ['id', 'name']) and so are
        # column values (e.g. ['21', 'Martin_1']). We use zip to help us combine each instance into a dictionary.
        assert [dict(zip(record.field['change'][0]['columnnames'], record.field['change'][0]['columnvalues']))
                for record in wiretap.output_records] == expected_data

        wiretap.reset()

        # insert third batch of data
        with database.engine.connect().execution_options(autocommit=True) as connection:
            for row in SAMPLE_DATA_3:
                connection.execute(table.insert(), row)

        sdc_executor.start_pipeline(pipeline)

        # Pipeline will stop once it sees id=319.
        sdc_executor.wait_for_pipeline_status(pipeline, 'FINISHED', timeout_sec=120)

        assert [dict(zip(record.field['change'][0]['columnnames'], record.field['change'][0]['columnvalues']))
                for record in wiretap.output_records] == SAMPLE_DATA_3

    finally:
        if sdc_executor.get_pipeline_status(pipeline).response.json().get('status') == 'RUNNING':
            sdc_executor.stop_pipeline(pipeline=pipeline, force=True)
        table.drop(database.engine)
        database.deactivate_and_drop_replication_slot(replication_slot)
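
start_from and create_slot are injected by a parametrization that is not shown in this excerpt. A plausible shape for it; the decorator values and ids are assumptions:

import pytest

# Assumed parametrization; the real test module may express this differently.
@pytest.mark.parametrize('create_slot', [True, False], ids=['pre_created_slot', 'slot_created_by_pipeline'])
@pytest.mark.parametrize('start_from', ['DATE', 'LSN'])
def test_start_not_from_latest(sdc_builder, sdc_executor, database, start_from, create_slot):
    ...  # body as shown above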
Example No. 7
def test_parse_datetimes(sdc_builder, sdc_executor, database,
                         stage_attributes):
    if not database.is_cdc_enabled:
        pytest.skip('Test only runs against PostgreSQL with CDC enabled.')
    table_name = get_random_string(string.ascii_lowercase, 20)
    start_date = '09-09-2017 10:10:20'

    date_time_format = '%m-%d-%Y %H:%M:%S'
    date_format = '%m-%d-%Y'
    time_format = '%H:%M:%S'

    db_date_output_format = '%Y-%m-%d'
    db_date_time_input_format = '%m-%d-%Y %H:%M:%S'
    db_date_time_output_format = '%Y-%m-%d %H:%M:%S'
    db_date_time_tz_output_format = '%Y-%m-%d %H:%M:%S%z'
    db_time_output_format = '%H:%M:%S'
    db_time_tz_output_format = '%H:%M:%S%z'

    replication_slot = get_random_string(string.ascii_lowercase, 10)
    stage_attributes.update({
        'max_batch_size_in_records': 1,
        'replication_slot': replication_slot,
        'poll_interval': POLL_INTERVAL,
        'initial_change': 'DATE',
        'start_date': start_date
    })
    try:
        # Create table and then perform some operations to simulate activity
        table = _create_table_in_database(table_name, database)

        stage_attributes.update({'tables': [{"table": table_name}]})

        connection = database.engine.connect()

        postgres_cdc_client, pipeline, wiretap = get_postgres_cdc_client_to_wiretap_pipeline(
            sdc_builder, database, stage_attributes)

        sdc_executor.add_pipeline(pipeline)

        sdc_executor.start_pipeline(pipeline)

        _insert(connection=connection, table=table)

        record_data_test_date = []
        record_data_test_datetime = []
        record_data_test_datetime_tz = []
        record_data_test_time = []
        record_data_test_time_tz = []
        for record in wiretap.output_records:
            record_change_data = record.get_field_data('change')
            for change in record_change_data:
                columnnames = change.get('columnnames')
                if columnnames is not None:
                    columnindex = columnnames.index('test_date')
                    record_data_test_date.append(
                        ('test_date', change.get('columnvalues')[columnindex]))
                    columnindex = columnnames.index('test_datetime')
                    record_data_test_datetime.append(
                        ('test_datetime',
                         change.get('columnvalues')[columnindex]))
                    columnindex = columnnames.index('test_datetime_tz')
                    record_data_test_datetime_tz.append(
                        ('test_datetime_tz',
                         change.get('columnvalues')[columnindex]))
                    columnindex = columnnames.index('test_time')
                    record_data_test_time.append(
                        ('test_time', change.get('columnvalues')[columnindex]))
                    columnindex = columnnames.index('test_time_tz')
                    record_data_test_time_tz.append(
                        ('test_time_tz',
                         change.get('columnvalues')[columnindex]))

        sdc_executor.stop_pipeline(pipeline=pipeline).wait_for_stopped(
            timeout_sec=60)

        if stage_attributes['parse_datetimes']:
            assert record_data_test_date == [
                ('test_date', datetime.strptime(row['test_date'], date_format))
                for row in INSERT_ROWS
            ]
            assert record_data_test_datetime == [
                ('test_datetime',
                 datetime.strptime(row['test_datetime'], date_time_format))
                for row in INSERT_ROWS
            ]
            assert record_data_test_datetime_tz == [
                ('test_datetime_tz',
                 datetime.strptime(row['test_datetime_tz'],
                                   date_time_format).replace(
                                       tzinfo=pytz.utc).isoformat().replace(
                                           '+00:00', 'Z'))
                for row in INSERT_ROWS
            ]
            assert record_data_test_time == [
                ('test_time',
                 datetime.strptime(row['test_time'],
                                   time_format).replace(year=1970))
                for row in INSERT_ROWS
            ]
            assert record_data_test_time_tz == [
                ('test_time_tz',
                 datetime.strptime(row['test_time_tz'],
                                   time_format).replace(year=1970))
                for row in INSERT_ROWS
            ]
        else:
            assert record_data_test_date == [
                ('test_date', datetime.strptime(
                    row['test_date'],
                    date_format).strftime(db_date_output_format))
                for row in INSERT_ROWS
            ]
            assert record_data_test_datetime == [
                ('test_datetime',
                 datetime.strptime(row['test_datetime'],
                                   db_date_time_input_format).strftime(
                                       db_date_time_output_format))
                for row in INSERT_ROWS
            ]
            assert record_data_test_datetime_tz == [
                ('test_datetime_tz',
                 datetime.strptime(
                     row['test_datetime_tz'],
                     db_date_time_input_format).replace(tzinfo=pytz.utc).
                 strftime(db_date_time_tz_output_format)[0:22])
                for row in INSERT_ROWS
            ]
            assert record_data_test_time == [
                ('test_time',
                 datetime.strptime(row['test_time'], time_format).replace(
                     year=1970).strftime(db_time_output_format))
                for row in INSERT_ROWS
            ]
            assert record_data_test_time_tz == [
                ('test_time_tz',
                 datetime.strptime(row['test_time_tz'], time_format).replace(
                     year=1970).replace(tzinfo=pytz.utc).strftime(
                         db_time_tz_output_format)[0:11])
                for row in INSERT_ROWS
            ]

    finally:
        if sdc_executor.get_pipeline_status(pipeline).response.json().get(
                'status') == 'RUNNING':
            sdc_executor.stop_pipeline(pipeline)
        if table is not None:
            table.drop(database.engine)
            logger.info('Table: %s dropped.', table_name)
        database.deactivate_and_drop_replication_slot(replication_slot)
        sdc_executor.remove_pipeline(pipeline)
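
get_postgres_cdc_client_to_wiretap_pipeline is another helper defined outside this excerpt. Based on how it is called above, a minimal sketch could look like this (the real helper may configure more):

def get_postgres_cdc_client_to_wiretap_pipeline(sdc_builder, database, stage_attributes):
    """Build a `PostgreSQL CDC Client >> wiretap` pipeline configured with stage_attributes."""
    pipeline_builder = sdc_builder.get_pipeline_builder()
    postgres_cdc_client = pipeline_builder.add_stage('PostgreSQL CDC Client')
    postgres_cdc_client.set_attributes(**stage_attributes)
    wiretap = pipeline_builder.add_wiretap()
    postgres_cdc_client >> wiretap.destination
    pipeline = pipeline_builder.build().configure_for_environment(database)
    return postgres_cdc_client, pipeline, wiretap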
Example No. 8
def test_stop_start(sdc_builder, sdc_executor, database, poll_interval):
    """Records are neither dropped, nor duplicated when a pipeline is stopped and then started in
    the midst of ingesting data. Repeat this a couple of times, and inbetween restart with no data
    and make sure offset can be read properly back.

    Runs with two poll intervals to verify that the Batch Wait Time (ms) configuration is respected.
    """
    if not database.is_cdc_enabled:
        pytest.skip('Test only runs against PostgreSQL with CDC enabled.')

    SAMPLE_DATA = [dict(id=i, name=f'Martin_{i}') for i in range(40)]
    table_name = get_random_string(string.ascii_lowercase, 20)
    table = sqlalchemy.Table(table_name,
                             sqlalchemy.MetaData(),
                             sqlalchemy.Column('id', sqlalchemy.Integer, primary_key=True),
                             sqlalchemy.Column('name', sqlalchemy.String(20)))
    replication_slot = get_random_string(string.ascii_lowercase, 10)

    try:
        pipeline_builder = sdc_builder.get_pipeline_builder()
        postgresql_cdc_client = pipeline_builder.add_stage('PostgreSQL CDC Client')
        postgresql_cdc_client.set_attributes(batch_wait_time_in_ms=10000,
                                             max_batch_size_in_records=10,
                                             poll_interval=poll_interval,
                                             replication_slot=replication_slot)
        wiretap = pipeline_builder.add_wiretap()
        pipeline_finisher = pipeline_builder.add_stage('Pipeline Finisher Executor')
        # We want the pipeline to stop automatically part-way through processing batch 1 and at the end of batch 2.
        pipeline_finisher.set_attributes(preconditions=[
            "${record:value('/change[0]/columnvalues[0]') == 9"
            " or record:value('/change[0]/columnvalues[0]') == 19"
            " or record:value('/change[0]/columnvalues[0]') == 29"
            " or record:value('/change[0]/columnvalues[0]') == 39}"
        ])
        postgresql_cdc_client >> [wiretap.destination, pipeline_finisher]
        pipeline = pipeline_builder.build().configure_for_environment(database)
        sdc_executor.add_pipeline(pipeline)

        # Start pipeline and add some data
        sdc_executor.start_pipeline(pipeline)

        table.create(database.engine)
        with database.engine.connect().execution_options(autocommit=True) as connection:
            for row in SAMPLE_DATA[:30]:
                connection.execute(table.insert(), row)

        # Pipeline will stop once it sees id=9.
        sdc_executor.wait_for_pipeline_status(pipeline, 'FINISHED')

        # Since we stop gracefully, we expect to see the entire first batch (records with id=0 through id=9)
        # written to the destination.
        # Within the field, column names are stored in a list (e.g. ['id', 'name']) and so are
        # column values (e.g. [1, 'Martin_1']). We use zip to help us combine each instance into a dictionary.
        assert [dict(zip(record.field['change'][0]['columnnames'], record.field['change'][0]['columnvalues']))
                for record in wiretap.output_records] == SAMPLE_DATA[:10]
        # Reset the wiretap so that we don't see records we've collected up to this point when we access it next time.
        wiretap.reset()
        logger.info('Starting pipeline for the second time ...')
        # Again, pipeline will stop on its own, this time when it processes the last record (id=19).
        sdc_executor.start_pipeline(pipeline).wait_for_finished()
        # We expect to see records with id=10 through id=19 (i.e. no duplicated or missing records).
        assert [dict(zip(record.field['change'][0]['columnnames'], record.field['change'][0]['columnvalues']))
                for record in wiretap.output_records] == SAMPLE_DATA[10:20]

        # Reset the wiretap so that we don't see records we've collected up to this point when we access it next time.
        wiretap.reset()
        logger.info('Starting pipeline for the third time ...')
        # Again, pipeline will stop on its own, this time when it processes the last record (id=29).
        sdc_executor.start_pipeline(pipeline).wait_for_finished()
        # We expect to see records with id=20 through id=29 (i.e. no duplicated or missing records).
        assert [dict(zip(record.field['change'][0]['columnnames'], record.field['change'][0]['columnvalues']))
                for record in wiretap.output_records] == SAMPLE_DATA[20:30]

        # Reset the wiretap so that we don't see records we've collected up to this point when we access it next time.
        wiretap.reset()
        logger.info('Starting pipeline for the fourth time ...')
        # Don't insert records and get an empty batch
        sdc_executor.start_pipeline(pipeline).wait_for_pipeline_batch_count(1)
        sdc_executor.stop_pipeline(pipeline)
        metrics = sdc_executor.get_pipeline_history(pipeline).latest.metrics
        assert metrics.counter("pipeline.batchOutputRecords.counter").count == 0

        # Add few records
        with database.engine.connect().execution_options(autocommit=True) as connection:
            for row in SAMPLE_DATA[30:40]:
                connection.execute(table.insert(), row)
        # Reset the wiretap so that we don't see records we've collected up to this point when we access it next time.
        wiretap.reset()
        logger.info('Starting pipeline for the fifth time ...')
        sdc_executor.start_pipeline(pipeline).wait_for_finished()
        # We expect to see records with id=30 through id=39 (i.e. no duplicated or missing records).
        assert [dict(zip(record.field['change'][0]['columnnames'], record.field['change'][0]['columnvalues']))
                for record in wiretap.output_records] == SAMPLE_DATA[30:40]
    finally:
        table.drop(database.engine)
        database.deactivate_and_drop_replication_slot(replication_slot)
Example No. 9
def test_data_types(sdc_builder, sdc_executor, database, data_type,
                    insert_fragment, expected_type, expected_value, keep_data):
    if not database.is_cdc_enabled:
        pytest.skip(
            'Test only runs against Aurora PostgreSQL with CDC enabled.')

    table_name = get_random_string(string.ascii_lowercase, 20)
    connection = database.engine.connect()

    builder = sdc_builder.get_pipeline_builder()

    origin = builder.add_stage('Aurora PostgreSQL CDC Client')
    replication_slot_name = get_random_string(string.ascii_lowercase, 10)
    origin.set_attributes(remove_replication_slot_on_close=True,
                          max_batch_size_in_records=1,
                          poll_interval=POLL_INTERVAL,
                          replication_slot=replication_slot_name)

    wiretap = builder.add_wiretap()

    origin >> wiretap.destination

    pipeline = builder.build().configure_for_environment(database)

    try:
        # Create table
        connection.execute(f"""
            CREATE TABLE {table_name}(
                id int primary key,
                data_column {data_type} NULL
            )
        """)

        sdc_executor.add_pipeline(pipeline)
        sdc_executor.start_pipeline(pipeline)

        # And insert a row with actual value
        connection.execute(
            f"INSERT INTO {table_name} VALUES(1, {insert_fragment})")
        # And a null
        connection.execute(f"INSERT INTO {table_name} VALUES(2, NULL)")

        sdc_executor.wait_for_pipeline_metric(pipeline,
                                              'input_record_count',
                                              2,
                                              timeout_sec=300)
        sdc_executor.stop_pipeline(pipeline)

        records = wiretap.output_records
        assert len(records) == 2

        record = records[0].field['change'][0]
        null_record = records[1].field['change'][0]
        index_of_data_type = record.get('columnnames').index('data_column')

        # Since we are controlling types, we want to check explicit values inside the record rather than the
        # Python wrappers.

        assert record.get(
            'columntypes')[index_of_data_type] == expected_type.lower()
        assert null_record.get(
            'columntypes')[index_of_data_type] == expected_type.lower()

        assert record.get(
            'columnvalues')[index_of_data_type].value == expected_value
        assert null_record.get(
            'columnvalues')[index_of_data_type].value is None

    finally:
        if sdc_executor.get_pipeline_status(pipeline).response.json().get(
                'status') == 'RUNNING':
            sdc_executor.stop_pipeline(pipeline=pipeline, force=True)
        if not keep_data:
            logger.info('Dropping table %s in %s database ...', table_name,
                        database.type)
            database.deactivate_and_drop_replication_slot(
                replication_slot_name)
            connection.execute(f'DROP TABLE {table_name}')
        connection.close()
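
test_data_types is driven by a (data_type, insert_fragment, expected_type, expected_value) parametrization defined elsewhere. A few illustrative rows, assuming wal2json reports PostgreSQL's canonical type names in columntypes; the real matrix is far larger and the exact expected values depend on the type:

import pytest

# Illustrative subset of the assumed data-type parametrization.
@pytest.mark.parametrize('data_type,insert_fragment,expected_type,expected_value', [
    ('INTEGER', '3', 'integer', 3),
    ('BOOLEAN', 'true', 'boolean', True),
    ('VARCHAR(10)', "'hello'", 'character varying(10)', 'hello'),
])
def test_data_types(sdc_builder, sdc_executor, database, data_type,
                    insert_fragment, expected_type, expected_value, keep_data):
    ...  # body as shown above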
Example No. 10
def test_resume_offset(sdc_builder, sdc_executor, database, keep_data):
    if not database.is_cdc_enabled:
        pytest.skip(
            'Test only runs against Aurora PostgreSQL with CDC enabled.')

    iterations = 3
    records_per_iteration = 10
    connection = database.engine.connect()
    table_name = get_random_string(string.ascii_lowercase, 20)

    table = sqlalchemy.Table(table_name,
                             sqlalchemy.MetaData(),
                             sqlalchemy.Column('id',
                                               sqlalchemy.Integer,
                                               primary_key=True,
                                               autoincrement=False),
                             schema=DEFAULT_SCHEMA_NAME)

    builder = sdc_builder.get_pipeline_builder()

    origin = builder.add_stage('Aurora PostgreSQL CDC Client')
    replication_slot_name = get_random_string(string.ascii_lowercase, 10)
    origin.set_attributes(remove_replication_slot_on_close=True,
                          max_batch_size_in_records=1,
                          poll_interval=POLL_INTERVAL,
                          replication_slot=replication_slot_name)

    delay = builder.add_stage('Delay')
    delay.set_attributes(delay_between_batches=5000)  # 5 seconds

    wiretap = builder.add_wiretap()

    origin >> delay >> wiretap.destination

    pipeline = builder.build().configure_for_environment(database)
    sdc_executor.add_pipeline(pipeline)

    try:
        logger.info('Creating table %s', table_name)
        table.create(connection.engine)

        for iteration in range(0, iterations):
            logger.info(f"Iteration: {iteration}")
            wiretap.reset()

            sdc_executor.start_pipeline(pipeline)

            logger.info('Inserting data into %s', table_name)
            for n in range(
                    iteration * records_per_iteration + 1,
                    iteration * records_per_iteration + 1 +
                    records_per_iteration):
                connection.execute(table.insert(), {'id': n})

            sdc_executor.wait_for_pipeline_metric(pipeline,
                                                  'input_record_count',
                                                  records_per_iteration,
                                                  timeout_sec=60)

            records = wiretap.output_records
            sdc_executor.stop_pipeline(pipeline)

            # We should get the right number of records
            assert len(records) == records_per_iteration

            expected_number = iteration * records_per_iteration + 1

            for r in records:
                record = r.field['change'][0]
                assert record.get('columnvalues')[record.get(
                    'columnnames').index('id')].value == expected_number
                expected_number = expected_number + 1
    finally:
        if sdc_executor.get_pipeline_status(pipeline).response.json().get(
                'status') == 'RUNNING':
            sdc_executor.stop_pipeline(pipeline=pipeline, force=True)
        if not keep_data:
            logger.info('Dropping table %s in %s database ...', table_name,
                        database.type)
            database.deactivate_and_drop_replication_slot(
                replication_slot_name)
            connection.execute(f'DROP TABLE {table_name}')
        connection.close()
Example No. 11
def test_multiple_batches(sdc_builder, sdc_executor, database, keep_data):
    if not database.is_cdc_enabled:
        pytest.skip(
            'Test only runs against Aurora PostgreSQL with CDC enabled.')

    connection = database.engine.connect()
    max_batch_size = 50
    batches = 5

    table_name = get_random_string(string.ascii_lowercase, 20)

    metadata = sqlalchemy.MetaData()
    table = sqlalchemy.Table(table_name,
                             metadata,
                             sqlalchemy.Column('id',
                                               sqlalchemy.Integer,
                                               primary_key=True,
                                               quote=True),
                             quote=True)

    builder = sdc_builder.get_pipeline_builder()

    origin = builder.add_stage('Aurora PostgreSQL CDC Client')
    replication_slot_name = get_random_string(string.ascii_lowercase, 10)
    origin.set_attributes(remove_replication_slot_on_close=True,
                          max_batch_size_in_records=max_batch_size,
                          poll_interval=POLL_INTERVAL,
                          replication_slot=replication_slot_name)

    wiretap = builder.add_wiretap()

    origin >> wiretap.destination

    pipeline = builder.build().configure_for_environment(database)
    sdc_executor.add_pipeline(pipeline)

    try:
        logger.info('Creating table %s', table_name)
        table.create(connection.engine)

        sdc_executor.start_pipeline(pipeline)

        logger.info('Inserting data into %s', table_name)
        for n in range(1, max_batch_size * batches + 1):
            connection.execute(table.insert(), {'id': n})

        sdc_executor.wait_for_pipeline_metric(pipeline, 'output_record_count',
                                              max_batch_size * batches + 1)
        sdc_executor.stop_pipeline(pipeline)

        records_changes = wiretap.output_records
        assert len(records_changes) == max_batch_size * batches
        records = [r.field['change'][0] for r in records_changes]

        # Get the ids from the records
        index_of_id = records[0].get('columnnames').index('id')
        id_records = [r.get('columnvalues')[index_of_id] for r in records]

        expected_number = 1
        for id_record in id_records:
            assert id_record == expected_number
            expected_number = expected_number + 1
    finally:
        if sdc_executor.get_pipeline_status(pipeline).response.json().get(
                'status') == 'RUNNING':
            sdc_executor.stop_pipeline(pipeline=pipeline, force=True)
        if not keep_data:
            logger.info('Dropping table %s in %s database ...', table_name,
                        database.type)
            database.deactivate_and_drop_replication_slot(
                replication_slot_name)
            connection.execute(f'DROP TABLE {table_name}')
        connection.close()
Example No. 12
def test_object_names(sdc_builder, sdc_executor, database, test_name,
                      table_name, offset_name, keep_data):
    if not database.is_cdc_enabled:
        pytest.skip(
            'Test only runs against Aurora PostgreSQL with CDC enabled.')

    connection = database.engine.connect()
    builder = sdc_builder.get_pipeline_builder()

    origin = builder.add_stage('Aurora PostgreSQL CDC Client')
    replication_slot_name = get_random_string(string.ascii_lowercase, 10)
    origin.set_attributes(remove_replication_slot_on_close=True,
                          max_batch_size_in_records=1,
                          poll_interval=POLL_INTERVAL,
                          replication_slot=replication_slot_name)

    wiretap = builder.add_wiretap()

    origin >> wiretap.destination

    pipeline = builder.build().configure_for_environment(database)

    # To make Aurora PostgreSQL treat object names as case-sensitive, they must be enclosed in double quotes
    table_name_quotes = '"' + table_name + '"'
    offset_name_quotes = '"' + offset_name + '"'

    try:
        logger.info('Creating table %s in %s database ...', table_name,
                    database.type)
        connection.execute(f"""
            CREATE TABLE {table_name_quotes} ( 
                {offset_name_quotes} int primary key
            )
        """)

        sdc_executor.add_pipeline(pipeline)
        sdc_executor.start_pipeline(pipeline)

        logger.info('Adding data into %s database ...', database.type)
        connection.execute(f"INSERT INTO {table_name_quotes} VALUES(1)")

        sdc_executor.wait_for_pipeline_metric(pipeline,
                                              'input_record_count',
                                              1,
                                              timeout_sec=300)
        sdc_executor.stop_pipeline(pipeline)

        records = wiretap.output_records
        assert len(records) == 1

        record = records[0].field['change'][0]

        # SDC will escape field names with certain characters, but not always...
        if "$" in offset_name:
            assert record.get('columnnames')[0] == f'{offset_name}'
        else:
            assert record.get('columnnames')[0] == offset_name
    finally:
        if sdc_executor.get_pipeline_status(pipeline).response.json().get(
                'status') == 'RUNNING':
            sdc_executor.stop_pipeline(pipeline=pipeline, force=True)
        if not keep_data:
            logger.info('Dropping table %s in %s database ...', table_name,
                        database.type)
            database.deactivate_and_drop_replication_slot(
                replication_slot_name)
            connection.execute(f'DROP TABLE {table_name_quotes}')
        connection.close()
Example No. 13
def test_data_types_as_primary_keys_serial_numeric(sdc_builder, sdc_executor,
                                                   database, data_type,
                                                   insert_fragment,
                                                   expected_type,
                                                   expected_value, keep_data):
    if not database.is_cdc_enabled:
        pytest.skip(
            'Test only runs against Aurora PostgreSQL with CDC enabled.')

    raw_data = ['The Hobbit', 'Tolkien', 'The Community']
    table_name = get_random_string(string.ascii_lowercase, 20)
    sequence_name = get_random_string(string.ascii_lowercase, 5)
    connection = database.engine.connect()

    builder = sdc_builder.get_pipeline_builder()

    origin = builder.add_stage('Aurora PostgreSQL CDC Client')
    replication_slot_name = get_random_string(string.ascii_lowercase, 10)
    origin.set_attributes(remove_replication_slot_on_close=True,
                          max_batch_size_in_records=1,
                          poll_interval=POLL_INTERVAL,
                          replication_slot=replication_slot_name)

    wiretap = builder.add_wiretap()

    origin >> wiretap.destination

    pipeline = builder.build().configure_for_environment(database)

    try:
        # Create table
        connection.execute(f"""
            CREATE TABLE {table_name}(
                id {data_type} primary key,
                name VARCHAR(20)
            )
        """)

        # Create sequence
        connection.execute(
            f"CREATE SEQUENCE {sequence_name} start 1 increment 1")

        sdc_executor.add_pipeline(pipeline)
        sdc_executor.start_pipeline(pipeline)

        # And insert a row with actual value
        data = ', '.join(
            [f"(nextval(\'{sequence_name}\'), \'{d}\')" for d in raw_data])
        query = f"INSERT INTO {table_name} (id, name) VALUES {data};"
        connection.execute(query)

        sdc_executor.wait_for_pipeline_metric(pipeline,
                                              'input_record_count',
                                              1,
                                              timeout_sec=300)
        sdc_executor.stop_pipeline(pipeline)

        records = wiretap.output_records[0].field['change']
        assert len(records) == expected_value

        # Since we are controlling types, we want to check explicit values inside the record rather than the
        # Python wrappers.
        index_of_primary_key = records[0].get('columnnames').index('id')
        index_of_name = records[0].get('columnnames').index('name')

        expected_id = 1
        for i in range(len(records)):
            record = records[i]

            # Assert the type and value of the primary key column 'id'
            assert record.get(
                'columntypes')[index_of_primary_key] == expected_type.lower()
            assert record.get(
                'columnvalues')[index_of_primary_key].value == expected_id
            expected_id = expected_id + 1

            # Assert the value of the column 'name'
            assert record.get('columnvalues')[index_of_name] == raw_data[i]

    finally:
        if sdc_executor.get_pipeline_status(pipeline).response.json().get(
                'status') == 'RUNNING':
            sdc_executor.stop_pipeline(pipeline=pipeline, force=True)
        if not keep_data:
            logger.info('Dropping table %s in %s database ...', table_name,
                        database.type)
            database.deactivate_and_drop_replication_slot(
                replication_slot_name)
            connection.execute(f'DROP TABLE {table_name}')
        connection.close()