Example #1
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        #run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))

        diff = self.expected_check_streams().symmetric_difference(
            found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))
        print("discovered schemas are OK")

        # select all catalogs and verify automatic fields are marked for inclusion

        for c in found_catalogs:
            catalog_entry = menagerie.get_annotated_schema(
                conn_id, c['stream_id'])

            for k in self.expected_automatic_fields()[c['stream_name']]:
                mdata = next(
                    (m for m in catalog_entry['metadata']
                     if len(m['breadcrumb']) == 2 and m['breadcrumb'][1] == k),
                    None)
                print("Validating inclusion on {}: {}".format(
                    c['stream_name'], mdata))
                self.assertTrue(
                    mdata and mdata['metadata']['inclusion'] == 'automatic')

            connections.select_catalog_and_fields_via_metadata(
                conn_id, c, catalog_entry)

        #clear state
        menagerie.set_state(conn_id, {})

        sync_job_name = runner.run_sync_mode(self, conn_id)

        #verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        first_record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
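        # reduce is assumed to be imported from functools at module level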
        replicated_row_count = reduce(lambda accum, c: accum + c,
                                      first_record_count_by_stream.values())
        self.assertGreater(replicated_row_count,
                           0,
                           msg="failed to replicate any data: {}".format(
                               first_record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        # Verify that automatic fields are all emitted with records
        synced_records = runner.get_records_from_target_output()
        for stream_name, data in synced_records.items():
            record_messages = [
                set(row['data'].keys()) for row in data['messages']
            ]
            for record_keys in record_messages:
                self.assertEqual(
                    self.expected_automatic_fields().get(stream_name, set()) -
                    record_keys, set())

        # Verify bookmarks were saved for all streams
        # TODO: No bookmarks for now, though `actions` at least will be
        state = menagerie.get_state(conn_id)
        # TODO: make more generic check for assertions in state
        self.assertTrue(
            state.get('bookmarks', {}).get('actions', {}).get('window_start'))
        #for stream in self.expected_sync_streams():
        #    self.assertTrue(stream_states.get(stream))

        print("Bookmarks are accurate, running second sync...")

        # Run another sync
        second_sync_job_name = runner.run_sync_mode(self, conn_id)

        #verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, second_sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status,
                                          second_sync_job_name)

        second_record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        for stream in self.expected_full_table_sync_streams():
            record_count = second_record_count_by_stream.get(stream, 0)
            # Assert we have enough data
            self.assertGreater(record_count, 0)
            # Assert that our bookmark works as expected
            self.assertEqual(record_count,
                             first_record_count_by_stream[stream])
            # self.assertEqual(record_count, 1)

        for stream in self.expected_incremental_sync_streams():
            record_count = second_record_count_by_stream.get(stream, 0)
            # We aren't generating data between the two syncs, and the
            # bookmark should be a little behind 'now', so the second sync
            # should return no data
            self.assertEqual(record_count, 0)

        print("Second sync record count is OK.")
Example #2
    def test_run(self):
        print("\n\nRUNNING {}\n\n".format(self.name()))

        # Initialize start_date state to make assertions
        self.START_DATE = self.get_properties().get('start_date')
        start_date_1 = self.START_DATE  # default
        start_date_2 = self.timedelta_formatted(self.START_DATE,
                                                2)  # default + 2 days

        # get expected records
        expected_records_1 = {x: []
                              for x in self.expected_streams()
                              }  # ids by stream
        for stream in self.testable_streams():
            existing_objects = self.client.get_all(stream)
            assert existing_objects, "Test data is not properly set for {}, test will fail.".format(
                stream)
            print("Data exists for stream: {}".format(stream))
            for obj in existing_objects:
                expected_records_1[stream].append(obj)

            # If no objects exist since the 2nd start_date, create one
            data_in_range = False
            for obj in expected_records_1.get(stream):
                created = obj.get('created_date')
                assert created, "'created_date' is not an attribute of {}".format(
                    obj.get('name'))
                # 2016-06-02 19:57:10 -->> 2016-06-02T19:57:10Z
                created = created.replace(' ', 'T', 1) + 'Z'
                if self.parse_date(created) > self.parse_date(start_date_2):
                    data_in_range = True
                    break
            if not data_in_range:
                if stream in self.testable_streams():
                    expected_records_1[stream].append(
                        self.client.create(stream))
                    continue
                assert None, "Sufficient test data does not exist for {}, test will fail.".format(
                    stream)

        ##########################################################################
        ### First Sync
        ##########################################################################

        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        diff = self.expected_check_streams().symmetric_difference(
            found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))
        print("discovered schemas are OK")

        # Select all available streams and their fields
        self.select_all_streams_and_fields(conn_id=conn_id,
                                           catalogs=found_catalogs)

        catalogs = menagerie.get_catalogs(conn_id)

        #clear state
        menagerie.set_state(conn_id, {})

        # Run sync 1
        sync_job_1 = runner.run_sync_mode(self, conn_id)

        # Verify tap exit codes
        exit_status_1 = menagerie.get_exit_status(conn_id, sync_job_1)
        menagerie.verify_sync_exit_status(self, exit_status_1, sync_job_1)

        # read target output
        record_count_by_stream_1 = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys())
        replicated_row_count_1 = reduce(lambda accum, c: accum + c,
                                        record_count_by_stream_1.values())
        self.assertGreater(replicated_row_count_1,
                           0,
                           msg="failed to replicate any data: {}".format(
                               record_count_by_stream_1))
        print("total replicated row count: {}".format(replicated_row_count_1))
        synced_records_1 = runner.get_records_from_target_output()

        state_1 = menagerie.get_state(conn_id)

        ##########################################################################
        ### Update START DATE Between Syncs
        ##########################################################################

        self.START_DATE = start_date_2
        print("REPLICATION START DATE CHANGE: {} ===>>> {} ".format(
            start_date_1, start_date_2))
        self.END_DATE = self.get_properties()['end_date']

        ##########################################################################
        ### Second Sync
        ##########################################################################

        # create a new connection with the new start_date
        conn_id = connections.ensure_connection(self,
                                                original_properties=False)

        #run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        #verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))

        diff = self.expected_check_streams().symmetric_difference(
            found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))
        print("discovered schemas are OK")

        # Select all available streams and their fields
        self.select_all_streams_and_fields(conn_id=conn_id,
                                           catalogs=found_catalogs)

        catalogs = menagerie.get_catalogs(conn_id)

        # clear state
        menagerie.set_state(conn_id, {})

        # Run sync 2
        sync_job_2 = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status_2 = menagerie.get_exit_status(conn_id, sync_job_2)
        menagerie.verify_sync_exit_status(self, exit_status_2, sync_job_2)

        # This should be validating that the PKs are written in each record
        record_count_by_stream_2 = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys())
        replicated_row_count_2 = reduce(lambda accum, c: accum + c,
                                        record_count_by_stream_2.values(), 0)
        self.assertGreater(replicated_row_count_2,
                           0,
                           msg="failed to replicate any data")
        print("total replicated row count: {}".format(replicated_row_count_2))

        synced_records_2 = runner.get_records_from_target_output()

        state_2 = menagerie.get_state(conn_id)

        for stream in self.testable_streams():
            with self.subTest(stream=stream):
                replication_type = self.expected_replication_method().get(
                    stream)
                record_count_1 = record_count_by_stream_1.get(stream, 0)
                record_count_2 = record_count_by_stream_2.get(stream, 0)

                # Testing how FULL TABLE streams handle start date
                if replication_type == self.FULL:

                    # Verify that a bookmark doesn't exist for the stream.
                    self.assertTrue(
                        state_1.get(stream) is None,
                        msg="There should not be bookmark value for {}\n{}".
                        format(stream, state_1.get(stream)))
                    self.assertTrue(
                        state_2.get(stream) is None,
                        msg="There should not be bookmark value for {}\n{}".
                        format(stream, state_2.get(stream)))

                    # Verify that the 2nd sync includes the same number of records as the 1st sync.
                    # -> Currently full table does not obey start_date, which makes this assertion valid
                    self.assertEqual(
                        record_count_2,
                        record_count_1,
                        msg="\nStream '{}' is {}\n".format(stream, self.FULL) +
                        "Record counts should be equal, but are not\n" +
                        "Sync 1 start_date: {} ".format(start_date_1) +
                        "Sync 1 record_count: {}\n".format(record_count_1) +
                        "Sync 2 start_date: {} ".format(start_date_2) +
                        "Sync 2 record_count: {}".format(record_count_2))

                    # Verify all records in the 1st sync are included in the 2nd sync,
                    # even though the 2nd sync has a later start date, because full
                    # table replication does not obey start_date.
                    records_from_sync_1 = set(
                        row.get('data').get('eid')
                        for row in synced_records_1.get(stream, {}).get(
                            'messages', []))
                    records_from_sync_2 = set(
                        row.get('data').get('eid')
                        for row in synced_records_2.get(stream, {}).get(
                            'messages', []))
                    self.assertEqual(
                        set(),
                        records_from_sync_1.difference(records_from_sync_2),
                        msg="Sync 1 record(s) missing from Sync 2:\n{}".format(
                            records_from_sync_1.difference(
                                records_from_sync_2)))

                else:
                    raise Exception(
                        "Expectations are set incorrectly. {} cannot have a "
                        "replication method of {}".format(
                            stream, replication_type))
Example #3
    def test_run(self):
        (table_configs, conn_id, _) = self.pre_sync_test()

        # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata
        found_catalogs = menagerie.get_catalogs(conn_id)
        for stream_catalog in found_catalogs:
            annotated_schema = menagerie.get_annotated_schema(
                conn_id, stream_catalog['stream_id'])
            additional_md = [{
                "breadcrumb": [],
                "metadata": {
                    'replication-method': 'FULL_TABLE'
                }
            }]
            connections.select_catalog_and_fields_via_metadata(
                conn_id, stream_catalog, annotated_schema, additional_md)

        # run full table sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify the persisted schema was correct
        records_by_stream = runner.get_records_from_target_output()
        expected_pks = {}

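        # expected primary key for each table is its HashKey, plus the SortKey when one is configured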
        for config in table_configs:
            key = {config['HashKey']}
            if config.get('SortKey'):
                key |= {config.get('SortKey')}
            expected_pks[config['TableName']] = key

        # assert that each of the streams that we synced are the ones that we expect to see
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, {x['TableName']
                            for x in table_configs}, expected_pks)

        state = menagerie.get_state(conn_id)

        first_versions = {}

        # assert that we get the correct number of records for each stream
        for config in table_configs:
            table_name = config['TableName']

            self.assertEqual(config['num_rows'],
                             record_count_by_stream[table_name])

            # assert that an activate_version_message is first and last message sent for each stream
            self.assertEqual(
                'activate_version',
                records_by_stream[table_name]['messages'][0]['action'])
            self.assertEqual(
                'activate_version',
                records_by_stream[table_name]['messages'][-1]['action'])

            # assert that the state has an initial_full_table_complete == True
            self.assertTrue(
                state['bookmarks'][table_name]['initial_full_table_complete'])
            # assert that there is a version bookmark in state
            first_versions[table_name] = state['bookmarks'][table_name][
                'version']
            self.assertIsNotNone(first_versions[table_name])
Example #4
    def test_run(self):
        """
        Verify that a full sync captures all data and sends it in the correct
        format for integer and boolean (bit) data.
        Verify that the first sync sends an activate_version message immediately.
        Verify that the table version is incremented.
        """
        print("running test {}".format(self.name()))

        conn_id = self.create_connection()

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # get the catalog information of discovery
        found_catalogs = menagerie.get_catalogs(conn_id)
        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'LOG_BASED'
            }
        }]
        BaseTapTest.select_all_streams_and_fields(conn_id,
                                                  found_catalogs,
                                                  additional_md=additional_md)

        # clear state
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify record counts of streams
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        # self.assertEqual(record_count_by_stream, expected_count)

        # verify records match on the first sync
        records_by_stream = runner.get_records_from_target_output()

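        # capture each stream's table version so it can be compared against bookmarks later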
        table_version = dict()
        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                table_version[stream] = records_by_stream[stream][
                    'table_version']

                # verify on the first sync you get
                # activate version message before and after all data for the full table
                # and before the logical replication part
                if records_by_stream[stream]['messages'][-1].get("data"):
                    last_row_data = True
                else:
                    last_row_data = False

                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-2]['action'],
                    'activate_version')
                if last_row_data:
                    self.assertEqual(
                        records_by_stream[stream]['messages'][-3]['action'],
                        'activate_version')
                else:
                    self.assertEqual(
                        records_by_stream[stream]['messages'][-1]['action'],
                        'activate_version')
                self.assertEqual(
                    len([
                        m for m in records_by_stream[stream]['messages'][1:]
                        if m["action"] == "activate_version"
                    ]),
                    2,
                    msg=
                    "Expect 2 more activate version messages for end of full table and beginning of log based"
                )

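                # pull the column name (the first key) from each field definition in FIELDS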
                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

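                # build the expected upsert messages by zipping column names with each expected row of values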
                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(
                            zip(column_names, stream_expected_data[self.VALUES]
                                [row]))
                    }
                } for row in range(len(stream_expected_data[self.VALUES]))]

                # Verify all data is correct for the full table part
                if last_row_data:
                    final_row = -3
                else:
                    final_row = -2

                for expected_row, actual_row in list(
                        zip(expected_messages, records_by_stream[stream]
                            ['messages'][1:final_row])):
                    with self.subTest(expected_row=expected_row):

                        self.assertEqual(actual_row["action"], "upsert")
                        self.assertEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row[
                                "data"].items():
                            if isinstance(expected_value, Decimal):
                                self.assertEqual(
                                    type(actual_row["data"][column_name]),
                                    Decimal,
                                    msg=
                                    "decimal value is not represented as a number"
                                )
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_row, actual_row))
                            else:
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_row, actual_row))

                # Verify all data is correct for the log replication part if sent
                if records_by_stream[stream]['messages'][-1].get("data"):
                    last_message = records_by_stream[stream]['messages'][-1]
                    for column_name, expected_value in expected_messages[-1][
                            "data"].items():
                        self.assertEqual(
                            expected_value,
                            last_message["data"][column_name],
                            msg="expected: {} != actual {}".format(
                                expected_messages[-1], last_message))

                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNotNone(
                    bookmark.get('current_log_version'),
                    msg=
                    "expected bookmark to have current_log_version because we are using log replication"
                )
                self.assertTrue(bookmark['initial_full_table_complete'],
                                msg="expected full table to be complete")
                initial_log_version = bookmark['current_log_version']

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(
                    records_by_stream[stream]['schema'],
                    simplejson.loads(simplejson.dumps(expected_schemas),
                                     use_decimal=True),
                    msg="expected: {} != actual: {}".format(
                        expected_schemas, records_by_stream[stream]['schema']))

        # ----------------------------------------------------------------------
        # invoke the sync job AGAIN after inserting, updating, and deleting rows
        # ----------------------------------------------------------------------

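        # insert, update, and delete rows in the decimal_precisions table, then adjust the expected metadata to match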
        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "decimal_precisions"
        precision_scale = DECIMAL_PRECISION_SCALE
        column_type = [
            "decimal({},{})".format(precision, scale)
            for precision, scale in precision_scale
        ]
        column_name = ["pk"] + [
            x.replace("(", "_").replace(",", "_").replace(")", "")
            for x in column_type
        ]
        insert_value = [
            (7, Decimal('-92473.8401'), Decimal('-4182159664734.645653'),
             Decimal('6101329656084900380190.268036'),
             Decimal('4778017533841887320066645.9761464001349')),
        ]
        update_value = [
            (3, Decimal('-92473.8401'), Decimal('-4182159664734.645653'),
             Decimal('6101329656084900380190.268036'),
             Decimal('4778017533841887320066645.9761464001349')),
        ]
        delete_value = [(4, )]
        query_list = (insert(database_name, schema_name, table_name,
                             insert_value))
        query_list.extend(
            delete_by_pk(database_name, schema_name, table_name, delete_value,
                         column_name[:1]))
        query_list.extend(
            update_by_pk(database_name, schema_name, table_name, update_value,
                         column_name))
        mssql_cursor_context_manager(*query_list)
        insert_value = [insert_value[0] + (None, )]
        update_value = [update_value[0] + (None, )]
        delete_value = [
            delete_value[0] + (None, None, None, None, datetime.utcnow())
        ]
        self.EXPECTED_METADATA["data_types_database_dbo_decimal_precisions"]["values"] = \
            [self.expected_metadata()["data_types_database_dbo_decimal_precisions"]["values"][-1]] + \
            insert_value + delete_value + update_value
        self.EXPECTED_METADATA["data_types_database_dbo_decimal_precisions"][
            "fields"].append({
                "_sdc_deleted_at": {
                    'sql-datatype': 'datetime',
                    'selected-by-default': True,
                    'inclusion': 'automatic'
                }
            })

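        # repeat the same insert, update, and delete pattern for the numeric_precisions table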
        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "numeric_precisions"
        precision_scale = NUMERIC_PRECISION_SCALE
        column_type = [
            "numeric({},{})".format(precision, scale)
            for precision, scale in precision_scale
        ]
        column_name = ["pk"] + [
            x.replace("(", "_").replace(",", "_").replace(")", "")
            for x in column_type
        ]
        insert_value = [
            (7, Decimal('96701.9382'), Decimal('-4371716.186100650268'),
             Decimal('-367352.306093776232045517794'),
             Decimal('-81147872128956247517327931319278572.985')),
        ]
        update_value = [
            (3, Decimal('96701.9382'), Decimal('-4371716.186100650268'),
             Decimal('-367352.306093776232045517794'),
             Decimal('-81147872128956247517327931319278572.985')),
        ]
        delete_value = [(4, )]
        query_list = (insert(database_name, schema_name, table_name,
                             insert_value))
        query_list.extend(
            delete_by_pk(database_name, schema_name, table_name, delete_value,
                         column_name[:1]))
        query_list.extend(
            update_by_pk(database_name, schema_name, table_name, update_value,
                         column_name))
        mssql_cursor_context_manager(*query_list)
        insert_value = [insert_value[0] + (None, )]
        update_value = [update_value[0] + (None, )]
        delete_value = [
            delete_value[0] + (None, None, None, None, datetime.utcnow())
        ]
        self.EXPECTED_METADATA["data_types_database_dbo_numeric_precisions"]["values"] = \
            insert_value + delete_value + update_value
        self.EXPECTED_METADATA["data_types_database_dbo_numeric_precisions"][
            "fields"].append({
                "_sdc_deleted_at": {
                    'sql-datatype': 'datetime',
                    'selected-by-default': True,
                    'inclusion': 'automatic'
                }
            })

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)
        records_by_stream = runner.get_records_from_target_output()

        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                new_table_version = records_by_stream[stream]['table_version']

                # verify on a subsequent sync you get activate version message only after all data
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertTrue(
                    all([
                        message["action"] == "upsert" for message in
                        records_by_stream[stream]['messages'][1:]
                    ]))

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

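                # expected upserts for the second sync, rebuilt from the updated expected values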
                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(
                            zip(column_names, stream_expected_data[self.VALUES]
                                [row]))
                    }
                } for row in range(len(stream_expected_data[self.VALUES]))]

                # remove sequences from actual values for comparison
                for message in records_by_stream[stream]['messages'][1:]:
                    message.pop("sequence")

                # Verify all data is correct
                for expected_row, actual_row in list(
                        zip(expected_messages,
                            records_by_stream[stream]['messages'][1:])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")

                        # we only send the _sdc_deleted_at column for deleted rows
                        self.assertGreaterEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")

                        for column_name, expected_value in expected_row[
                                "data"].items():
                            if column_name != "_sdc_deleted_at":
                                if isinstance(expected_value, Decimal):
                                    self.assertEqual(
                                        type(actual_row["data"][column_name]),
                                        Decimal,
                                        msg=
                                        "decimal value is not represented as a number"
                                    )
                                    self.assertEqual(
                                        expected_value,
                                        actual_row["data"][column_name],
                                        msg="expected: {} != actual {}".format(
                                            expected_row, actual_row))
                                else:
                                    self.assertEqual(
                                        expected_value,
                                        actual_row["data"][column_name],
                                        msg="expected: {} != actual {}".format(
                                            expected_row, actual_row))
                            elif expected_value:
                                # we have an expected value for a deleted row
                                try:
                                    actual_value = datetime.strptime(
                                        actual_row["data"][column_name],
                                        "%Y-%m-%dT%H:%M:%S.%fZ")
                                except ValueError:
                                    actual_value = datetime.strptime(
                                        actual_row["data"][column_name],
                                        "%Y-%m-%dT%H:%M:%SZ")
                                self.assertGreaterEqual(
                                    actual_value,
                                    expected_value - timedelta(seconds=15))
                                self.assertLessEqual(
                                    actual_value,
                                    expected_value + timedelta(seconds=15))
                            else:
                                # the row wasn't deleted so we can either not pass the column or it can be None
                                self.assertIsNone(
                                    actual_row["data"].get(column_name))

                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNotNone(
                    bookmark.get('current_log_version'),
                    msg=
                    "expected bookmark to have current_log_version because we are using log replication"
                )
                self.assertTrue(bookmark['initial_full_table_complete'],
                                msg="expected full table to be complete")
                new_log_version = bookmark['current_log_version']
                self.assertGreater(new_log_version,
                                   initial_log_version,
                                   msg='expected log version to increase')

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")
                self.assertEqual(
                    bookmark['version'],
                    new_table_version,
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(
                    records_by_stream[stream]['schema'],
                    simplejson.loads(simplejson.dumps(expected_schemas),
                                     use_decimal=True),
                    msg="expected: {} != actual: {}".format(
                        expected_schemas, records_by_stream[stream]['schema']))
Example #5
    def test_run(self):
        """
        Verify that a full sync captures all data and sends it in the correct
        format for integer and boolean (bit) data.
        Verify that the first sync sends an activate_version message immediately.
        Verify that the table version is incremented.
        """
        print("running test {}".format(self.name()))

        conn_id = self.create_connection()

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # get the catalog information of discovery
        found_catalogs = menagerie.get_catalogs(conn_id)
        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'INCREMENTAL',
                'replication-key': 'replication_key_column'
            }
        }]
        BaseTapTest.select_all_streams_and_fields(conn_id,
                                                  found_catalogs,
                                                  additional_md=additional_md)

        # clear state
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify record counts of streams
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)

        # verify records match on the first sync
        records_by_stream = runner.get_records_from_target_output()

        table_version = dict()
        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                table_version[stream] = records_by_stream[stream][
                    'table_version']

                # verify on the first sync you get an activate version message
                # before the upserted data
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')

                self.assertTrue(
                    all([
                        m["action"] == "upsert"
                        for m in records_by_stream[stream]['messages'][1:-1]
                    ]),
                    msg="Expect all but the first message to be upserts")
                self.assertEqual(
                    len(stream_expected_data[self.VALUES]),
                    len(records_by_stream[stream]['messages'][1:-1]),
                    msg="incorrect number of upserts")

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

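                # expected upserts ordered by the replication key; rows with a None key sort first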
                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(zip(column_names,
                                                      row_values))
                    }
                } for row_values in sorted(stream_expected_data[self.VALUES],
                                           key=lambda row:
                                           (row[1] is not None, row[1]))]

                # Verify all data is correct for incremental
                for expected_row, actual_row in list(
                        zip(expected_messages,
                            records_by_stream[stream]['messages'][1:])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")
                        self.assertEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row[
                                "data"].items():
                            column_index = [
                                list(key.keys())[0] for key in
                                self.expected_metadata()[stream][self.FIELDS]
                            ].index(column_name)
                            if self.expected_metadata()[stream][self.FIELDS][column_index][column_name][self.DATATYPE] \
                                    in ("real", "float") \
                                    and actual_row["data"][column_name] is not None:
                                self.assertEqual(
                                    type(actual_row["data"][column_name]),
                                    Decimal,
                                    msg=
                                    "float value is not represented as a number"
                                )
                                self.assertEqual(
                                    float(str(float32(expected_value))),
                                    float(
                                        str(
                                            float32(actual_row["data"]
                                                    [column_name]))),
                                    msg=
                                    "single value of {} doesn't match actual {}"
                                    .format(
                                        float(str(float32(expected_value))),
                                        float(
                                            str(
                                                float32(actual_row["data"]
                                                        [column_name])))))
                            else:
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_row, actual_row))
                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNone(bookmark.get('current_log_version'),
                                  msg="no log_version for incremental")
                self.assertIsNone(bookmark.get('initial_full_table_complete'),
                                  msg="no full table for incremental")
                # find the max value of the replication key
                self.assertEqual(
                    bookmark['replication_key_value'],
                    max([
                        row[1] for row in stream_expected_data[self.VALUES]
                        if row[1] is not None
                    ]))
                # self.assertEqual(bookmark['replication_key'], 'replication_key_value')

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(
                                     expected_schemas,
                                     records_by_stream[stream]['schema']))

        # ----------------------------------------------------------------------
        # invoke the sync job AGAIN after inserting, updating, and deleting rows
        # ----------------------------------------------------------------------

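        # insert, update, and delete rows in the float_precisions table ahead of the second (incremental) sync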
        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "float_precisions"
        column_name = [
            "pk", "replication_key_column", "float_53", "real_24_bits"
        ]
        insert_value = [(15, 100, 100, 100),
                        (14, 3.4028235e+38, 1.7976931348623157e+308,
                         3.4028235e+38)]
        update_value = [(4, 101, 101, 101),
                        (6, 3.4028233e+38, 1.7976931348623157e+308,
                         3.4028235e+38)]
        delete_value = [(5, )]
        query_list = (insert(database_name, schema_name, table_name,
                             insert_value))
        query_list.extend(
            delete_by_pk(database_name, schema_name, table_name, delete_value,
                         column_name[:1]))
        query_list.extend(
            update_by_pk(database_name, schema_name, table_name, update_value,
                         column_name))
        mssql_cursor_context_manager(*query_list)
        insert_value = insert_value[-1:]  # only repl_key >= gets included
        update_value = update_value[-1:]
        self.EXPECTED_METADATA["data_types_database_dbo_float_precisions"]["values"] = \
            [(1, 3.4028230e+38, 1.7976931348623157e+308, 3.4028235e+38)] + update_value + insert_value

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)
        records_by_stream = runner.get_records_from_target_output()

        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                new_table_version = records_by_stream[stream]['table_version']

                # verify on a subsequent sync you get activate version message only after all data
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-1]['action'],
                    'activate_version')
                self.assertTrue(
                    all([
                        message["action"] == "upsert" for message in
                        records_by_stream[stream]['messages'][1:-1]
                    ]))

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

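                # rebuild the expected upserts in replication-key order for the second sync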
                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(zip(column_names,
                                                      row_values))
                    }
                } for row_values in sorted(stream_expected_data[self.VALUES],
                                           key=lambda row:
                                           (row[1] is not None, row[1]))]

                # remove sequences from actual values for comparison
                for message in records_by_stream[stream]['messages'][1:-1]:
                    message.pop("sequence")

                # Verify all data is correct
                for expected_row, actual_row in list(
                        zip(expected_messages,
                            records_by_stream[stream]['messages'][1:-1])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")

                        # we only send the _sdc_deleted_at column for deleted rows
                        self.assertEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row[
                                "data"].items():
                            column_index = [
                                list(key.keys())[0] for key in
                                self.expected_metadata()[stream][self.FIELDS]
                            ].index(column_name)
                            if self.expected_metadata()[stream][self.FIELDS][column_index][column_name][self.DATATYPE] \
                                    in ("real", "float") \
                                    and actual_row["data"][column_name] is not None:
                                self.assertEqual(
                                    type(actual_row["data"][column_name]),
                                    Decimal,
                                    msg=
                                    "float value is not represented as a number"
                                )
                                self.assertEqual(
                                    float(str(float32(expected_value))),
                                    float(
                                        str(
                                            float32(actual_row["data"]
                                                    [column_name]))),
                                    msg=
                                    "single value of {} doesn't match actual {}"
                                    .format(
                                        float(str(float32(expected_value))),
                                        float(
                                            str(
                                                float32(actual_row["data"]
                                                        [column_name])))))
                            else:
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_row, actual_row))

                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNone(bookmark.get('current_log_version'),
                                  msg="no log_version for incremental")
                self.assertIsNone(bookmark.get('initial_full_table_complete'),
                                  msg="no full table for incremental")
                # find the max value of the replication key
                self.assertEqual(
                    bookmark['replication_key_value'],
                    max([
                        row[1] for row in stream_expected_data[self.VALUES]
                        if row[1] is not None
                    ]))
                # self.assertEqual(bookmark['replication_key'], 'replication_key_value')

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")
                self.assertEqual(
                    bookmark['version'],
                    new_table_version,
                    msg="expected bookmark for stream to match version")

                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(
                                     expected_schemas,
                                     records_by_stream[stream]['schema']))
Example #6
    def test_run(self):
        """
        Verify that for each stream you can get multiple pages of data
        when no fields are selected and only the automatic fields are replicated.

        PREREQUISITE
        For EACH stream add enough data that you surpass the limit of a single
        fetch of data.  For instance if you have a limit of 250 records ensure
        that 251 (or more) records have been posted for that stream.
        """
        print("\n\nRUNNING {}\n\n".format(self.name()))

        # Resetting tracked parent objects prior to test
        utils.reset_tracked_parent_objects()

        # ensure data exists for sync streams and set expectations
        expected_records = {x: []
                            for x in self.expected_sync_streams()
                            }  # ids by stream
        for stream in self.testable_streams():
            since = None
            if stream in self.expected_incremental_streams():
                since = dt.strptime(self.get_properties()['start_date'],
                                    self.START_DATE_FORMAT).strftime(
                                        self.TEST_TIME_FORMAT)
            _, existing_objects = utils.get_total_record_count_and_objects(
                stream, since=since)
            if existing_objects:
                logging.info("Data exists for stream: {}".format(stream))
                for obj in existing_objects:
                    expected_records[stream].append({
                        field: obj.get(field)
                        for field in self.expected_automatic_fields().get(
                            stream)
                    })
                continue

            logging.info("Data does not exist for stream: {}".format(stream))

            new_object = utils.create_object(stream)
            logging.info("Data generated for stream: {}".format(stream))
            expected_records[stream].append({
                field: new_object.get(field)
                for field in self.expected_automatic_fields().get(stream)
            })

        conn_id = connections.ensure_connection(self)

        #run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        #verify check  exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        diff = self.expected_check_streams().symmetric_difference(
            found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))
        print("discovered schemas are OK")

        # Select all streams but only automatic fields
        self.select_all_streams_and_fields(conn_id,
                                           found_catalogs,
                                           select_all_fields=False)

        for cat in found_catalogs:
            catalog_entry = menagerie.get_annotated_schema(
                conn_id, cat['stream_id'])
            for k in self.expected_automatic_fields()[cat['stream_name']]:
                mdata = next(
                    (m for m in catalog_entry['metadata']
                     if len(m['breadcrumb']) == 2 and m['breadcrumb'][1] == k),
                    None)
                print("Validating inclusion on {}: {}".format(
                    cat['stream_name'], mdata))
                self.assertTrue(
                    mdata and mdata['metadata']['inclusion'] == 'automatic')

        catalogs = menagerie.get_catalogs(conn_id)

        #clear state
        menagerie.set_state(conn_id, {})

        # run sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # Verify tap exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # read target output
        first_record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = reduce(lambda accum, c: accum + c,
                                      first_record_count_by_stream.values())
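        # Note: the reduce above is equivalent to the simpler built-in:
        # replicated_row_count = sum(first_record_count_by_stream.values())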
        synced_records = runner.get_records_from_target_output()

        # Verify target has records for all synced streams
        for stream, count in first_record_count_by_stream.items():
            self.assertIn(stream, self.expected_sync_streams())
            self.assertGreater(
                count,
                0,
                msg="failed to replicate any data for: {}".format(stream))
        print("total replicated row count: {}".format(replicated_row_count))

        for stream in self.testable_streams():
            with self.subTest(stream=stream):
                data = synced_records.get(stream)
                record_messages_keys = [
                    set(row['data'].keys()) for row in data['messages']
                ]
                expected_keys = self.expected_automatic_fields().get(stream)

                # Verify that ONLY automatic fields are emitted
                for actual_keys in record_messages_keys:
                    self.assertEqual(
                        actual_keys.symmetric_difference(expected_keys),
                        set(),
                        msg="Expected automatic fields and nothing else.")

                actual_records = [row['data'] for row in data['messages']]

                # Verify the number of records matches expectations
                self.assertEqual(len(expected_records.get(stream)),
                                 len(actual_records),
                                 msg="Number of actual records does not match expectations. "
                                 "We probably have duplicate records.")

                # verify by values, that we replicated the expected records
                for actual_record in actual_records:
                    self.assertTrue(
                        actual_record in expected_records.get(stream),
                        msg="Actual record missing from expectations")
                for expected_record in expected_records.get(stream):
                    self.assertTrue(expected_record in actual_records,
                                    msg="Expected record missing from target.")

        # CLEAN UP
        stream_to_delete = 'boards'
        boards_remaining = 5
        print("Deleting all but {} records for stream {}.".format(
            boards_remaining, stream_to_delete))
        board_count = len(expected_records.get(stream_to_delete, []))
        for obj_to_delete in expected_records.get(
                stream_to_delete, []):  # Delete all boards between syncs
            if board_count > boards_remaining:
                utils.delete_object(stream_to_delete, obj_to_delete.get('id'))
                board_count -= 1
            else:
                break

        # Reset the parent objects that we have been tracking
        utils.reset_tracked_parent_objects()
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        #run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        #verify check  exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))

        diff = self.expected_check_streams().symmetric_difference(
            found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))
        print("discovered schemas are kosher")

        #select all catalogs
        for catalog in found_catalogs:
            connections.select_catalog_and_fields_via_metadata(
                conn_id, catalog,
                menagerie.get_annotated_schema(conn_id, catalog['stream_id']))

        future_time = "2050-01-01T00:00:00.000000Z"

        #clear state
        future_bookmarks = {
            "currently_syncing": None,
            "bookmarks": {
                "contacts": {
                    "offset": {},
                    "versionTimestamp": future_time
                },
                "subscription_changes": {
                    "startTimestamp": future_time,
                    "offset": {}
                },
                "campaigns": {
                    "offset": {}
                },
                "forms": {
                    "updatedAt": future_time
                },
                "deals": {
                    "offset": {},
                    "hs_lastmodifieddate": future_time
                },
                "workflows": {
                    "updatedAt": future_time
                },
                "owners": {
                    "updatedAt": future_time
                },
                "contact_lists": {
                    "updatedAt": future_time,
                    "offset": {}
                },
                "email_events": {
                    "startTimestamp": future_time,
                    "offset": {}
                },
                "companies": {
                    "offset": {},
                    "hs_lastmodifieddate": future_time
                },
                "engagements": {
                    "lastUpdated": future_time,
                    "offset": {}
                }
            }
        }

        menagerie.set_state(conn_id, future_bookmarks)

        sync_job_name = runner.run_sync_mode(self, conn_id)

        #verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())

        #because the bookmarks were set into the future, we should NOT actually replicate any data.
        #minus campaigns and deal_pipelines, because those endpoints do NOT support bookmarks
        streams_with_bookmarks = self.expected_sync_streams()
        streams_with_bookmarks.remove('campaigns')
        streams_with_bookmarks.remove('deal_pipelines')
        bad_streams = streams_with_bookmarks.intersection(
            record_count_by_stream.keys())
        self.assertEqual(
            len(bad_streams),
            0,
            msg="still pulled down records from {} despite future bookmarks".
            format(bad_streams))

        state = menagerie.get_state(conn_id)

        # NB: Companies and engagements won't set a bookmark in the future.
        state["bookmarks"].pop("companies")
        state["bookmarks"].pop("engagements")
        future_bookmarks["bookmarks"].pop("companies")
        future_bookmarks["bookmarks"].pop("engagements")

        self.assertEqual(
            state,
            future_bookmarks,
            msg=
            "state should not have been modified because we didn't replicate any data"
        )
        bookmarks = state.get('bookmarks')
        bookmark_streams = set(state.get('bookmarks').keys())
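The future-bookmark pattern above generalizes to other Singer tap tests: seed every incremental stream's bookmark with a timestamp far in the future, sync, and assert that nothing was replicated and the state round-trips unchanged. Below is a minimal sketch of a helper that builds such a state; the bookmark_key_by_stream mapping and the uniform "offset" entry are assumptions, not part of the original test, which writes the state dict out by hand.

    def build_future_state(self, bookmark_key_by_stream,
                           future_time="2050-01-01T00:00:00.000000Z"):
        # Sketch only: build a state blob whose bookmarks all point past any real
        # data. The original test hand-writes this dict and omits "offset" for
        # streams that do not page; here it is included uniformly for brevity.
        return {
            "currently_syncing": None,
            "bookmarks": {
                stream: {bookmark_key: future_time, "offset": {}}
                for stream, bookmark_key in bookmark_key_by_stream.items()
            },
        }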
Example No. 8
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check  exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify discovery produced (at least) 1 expected catalog
        found_catalogs = [
            found_catalog for found_catalog in menagerie.get_catalogs(conn_id)
            if found_catalog['tap_stream_id'] in self.expected_check_streams()
        ]
        self.assertGreaterEqual(len(found_catalogs), 1)

        # verify the tap discovered the expected streams
        found_catalog_names = {
            catalog['tap_stream_id']
            for catalog in found_catalogs
        }
        self.assertSetEqual(self.expected_check_streams(), found_catalog_names)

        # verify that persisted streams have the correct properties
        test_catalog = found_catalogs[0]
        self.assertEqual(test_table_name, test_catalog['stream_name'])
        print("discovered streams are correct")

        # perform table selection
        print('selecting {} and all fields within the table'.format(
            test_table_name))
        schema_and_metadata = menagerie.get_annotated_schema(
            conn_id, test_catalog['stream_id'])
        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'FULL_TABLE'
            }
        }]
        _ = connections.select_catalog_and_fields_via_metadata(
            conn_id, test_catalog, schema_and_metadata, additional_md)

        # clear state
        menagerie.set_state(conn_id, {})

        # run sync job 1 and verify exit codes
        sync_job_name = runner.run_sync_mode(self, conn_id)
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # get records
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        records_by_stream = runner.get_records_from_target_output()
        table_version_1 = records_by_stream[test_table_name]['table_version']
        messages = records_by_stream[test_table_name]['messages']

        # verify the expected number of records were replicated
        self.assertEqual(3, record_count_by_stream[test_table_name])

        # verify the message actions match expectations
        self.assertEqual(5, len(messages))
        self.assertEqual('activate_version', messages[0]['action'])
        self.assertEqual('upsert', messages[1]['action'])
        self.assertEqual('upsert', messages[2]['action'])
        self.assertEqual('upsert', messages[3]['action'])
        self.assertEqual('activate_version', messages[4]['action'])
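        # Equivalent single assertion over the whole action sequence (a sketch,
        # not a change to the original test):
        #
        # self.assertEqual(
        #     ['activate_version', 'upsert', 'upsert', 'upsert', 'activate_version'],
        #     [m['action'] for m in messages])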

        # verify the persisted schema matches expectations
        self.assertEqual(expected_schemas[test_table_name],
                         records_by_stream[test_table_name]['schema'])

        # verify replicated records match expectations
        self.assertDictEqual(self.expected_records[0], messages[1]['data'])
        self.assertDictEqual(self.expected_records[1], messages[2]['data'])
        self.assertDictEqual(self.expected_records[2], messages[3]['data'])

        print("records are correct")

        # grab bookmarked state
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][
            'dev-public-postgres_full_table_replication_test']

        # verify state and bookmarks meet expectations
        self.assertIsNone(state['currently_syncing'])
        self.assertIsNone(bookmark.get('lsn'))
        self.assertIsNone(bookmark.get('replication_key'))
        self.assertIsNone(bookmark.get('replication_key_value'))
        self.assertEqual(table_version_1, bookmark['version'])

        #----------------------------------------------------------------------
        # invoke the sync job AGAIN and get the same 3 records
        #----------------------------------------------------------------------

        # run sync job 2 and verify exit codes
        sync_job_name = runner.run_sync_mode(self, conn_id)
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # get records
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        records_by_stream = runner.get_records_from_target_output()
        table_version_2 = records_by_stream[test_table_name]['table_version']
        messages = records_by_stream[test_table_name]['messages']

        # verify the expected number of records were replicated
        self.assertEqual(3, record_count_by_stream[test_table_name])

        # verify the message actions match expectations
        self.assertEqual(4, len(messages))
        self.assertEqual('upsert', messages[0]['action'])
        self.assertEqual('upsert', messages[1]['action'])
        self.assertEqual('upsert', messages[2]['action'])
        self.assertEqual('activate_version', messages[3]['action'])

        # verify the new table version increased on the second sync
        self.assertGreater(table_version_2, table_version_1)

        # verify the persisted schema still matches expectations
        self.assertEqual(expected_schemas[test_table_name],
                         records_by_stream[test_table_name]['schema'])

        # verify replicated records still match expectations
        self.assertDictEqual(self.expected_records[0], messages[0]['data'])
        self.assertDictEqual(self.expected_records[1], messages[1]['data'])
        self.assertDictEqual(self.expected_records[2], messages[2]['data'])

        # grab bookmarked state
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][
            'dev-public-postgres_full_table_replication_test']

        # verify state and bookmarks meet expectations
        self.assertIsNone(state['currently_syncing'])
        self.assertIsNone(bookmark.get('lsn'))
        self.assertIsNone(bookmark.get('replication_key'))
        self.assertIsNone(bookmark.get('replication_key_value'))
        self.assertEqual(table_version_2, bookmark['version'])

        #----------------------------------------------------------------------
        # invoke the sync job AGAIN following various manipulations to the data
        #----------------------------------------------------------------------

        with db_utils.get_test_connection('dev') as conn:
            conn.autocommit = True
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:

                # NB | We will perform the following actions prior to the next sync:
                #      [Action (EXPECTED RESULT)]

                #      Insert a record
                #      Insert a record to be updated prior to sync
                #      Insert a record to be deleted prior to sync (NOT REPLICATED)

                #      Update an existing record
                #      Update a newly inserted record

                #      Delete an existing record
                #      Delete  a newly inserted record

                # inserting...
                # a new record
                nyc_tz = pytz.timezone('America/New_York')
                our_time_offset = "-04:00"
                our_ts = datetime.datetime(1996, 4, 4, 4, 4, 4, 733184)
                our_ts_tz = nyc_tz.localize(our_ts)
                our_time = datetime.time(6, 6, 6)
                our_time_tz = our_time.isoformat() + our_time_offset
                our_date = datetime.date(1970, 7, 1)
                my_uuid = str(uuid.uuid1())
                self.inserted_records.append({
                    'our_varchar':
                    "our_varchar 2",
                    'our_varchar_10':
                    "varchar_10",
                    'our_text':
                    "some text 2",
                    'our_integer':
                    44101,
                    'our_smallint':
                    2,
                    'our_bigint':
                    1000001,
                    'our_decimal':
                    decimal.Decimal('9876543210.02'),
                    quote_ident('OUR TS', cur):
                    our_ts,
                    quote_ident('OUR TS TZ', cur):
                    our_ts_tz,
                    quote_ident('OUR TIME', cur):
                    our_time,
                    quote_ident('OUR TIME TZ', cur):
                    our_time_tz,
                    quote_ident('OUR DATE', cur):
                    our_date,
                    'our_double':
                    decimal.Decimal('1.1'),
                    'our_real':
                    decimal.Decimal('1.2'),
                    'our_boolean':
                    True,
                    'our_bit':
                    '1',
                    'our_json':
                    json.dumps({'nymn': 77}),
                    'our_jsonb':
                    json.dumps({'burgers': 'good++'}),
                    'our_uuid':
                    my_uuid,
                    'our_citext':
                    'cyclops 2',
                    'our_store':
                    'dances=>"floor",name=>"betty"',
                    'our_cidr':
                    '192.168.101.128/25',
                    'our_inet':
                    '192.168.101.128/24',
                    'our_mac':
                    '08:00:2b:01:02:04',
                    'our_money':
                    '$0.98789'
                })
                self.expected_records.append({
                    'id':
                    4,
                    'our_varchar':
                    "our_varchar 2",
                    'our_varchar_10':
                    "varchar_10",
                    'our_text':
                    "some text 2",
                    'our_integer':
                    44101,
                    'our_smallint':
                    2,
                    'our_bigint':
                    1000001,
                    'our_decimal':
                    decimal.Decimal('9876543210.02'),
                    'OUR TS':
                    self.expected_ts(our_ts),
                    'OUR TS TZ':
                    self.expected_ts_tz(our_ts_tz),
                    'OUR TIME':
                    str(our_time),
                    'OUR TIME TZ':
                    str(our_time_tz),
                    'OUR DATE':
                    '1970-07-01T00:00:00+00:00',
                    'our_double':
                    decimal.Decimal('1.1'),
                    'our_real':
                    decimal.Decimal('1.2'),
                    'our_boolean':
                    True,
                    'our_bit':
                    True,
                    'our_json':
                    '{"nymn": 77}',
                    'our_jsonb':
                    '{"burgers": "good++"}',
                    'our_uuid':
                    self.inserted_records[-1]['our_uuid'],
                    'our_citext':
                    self.inserted_records[-1]['our_citext'],
                    'our_store': {
                        "name": "betty",
                        "dances": "floor"
                    },
                    'our_cidr':
                    self.inserted_records[-1]['our_cidr'],
                    'our_inet':
                    self.inserted_records[-1]['our_inet'],
                    'our_mac':
                    self.inserted_records[-1]['our_mac'],
                    'our_money':
                    '$0.99',
                    'our_alignment_enum':
                    None,
                })
                # a new record which we will then update prior to sync
                our_ts = datetime.datetime(2007, 1, 1, 12, 12, 12, 222111)
                nyc_tz = pytz.timezone('America/New_York')
                our_ts_tz = nyc_tz.localize(our_ts)
                our_time = datetime.time(12, 11, 10)
                our_time_tz = our_time.isoformat() + "-04:00"
                our_date = datetime.date(1999, 9, 9)
                my_uuid = str(uuid.uuid1())
                self.inserted_records.append({
                    'our_varchar':
                    "our_varchar 4",
                    'our_varchar_10':
                    "varchar_3",
                    'our_text':
                    "some text 4",
                    'our_integer':
                    55200,
                    'our_smallint':
                    1,
                    'our_bigint':
                    100000,
                    'our_decimal':
                    decimal.Decimal('1234567899.99'),
                    quote_ident('OUR TS', cur):
                    our_ts,
                    quote_ident('OUR TS TZ', cur):
                    our_ts_tz,
                    quote_ident('OUR TIME', cur):
                    our_time,
                    quote_ident('OUR TIME TZ', cur):
                    our_time_tz,
                    quote_ident('OUR DATE', cur):
                    our_date,
                    'our_double':
                    decimal.Decimal('1.1'),
                    'our_real':
                    decimal.Decimal('1.2'),
                    'our_boolean':
                    True,
                    'our_bit':
                    '0',
                    'our_json':
                    json.dumps('some string'),
                    'our_jsonb':
                    json.dumps(['burgers are good']),
                    'our_uuid':
                    my_uuid,
                    'our_store':
                    'size=>"small",name=>"betty"',
                    'our_citext':
                    'cyclops 3',
                    'our_cidr':
                    '192.168.101.128/25',
                    'our_inet':
                    '192.168.101.128/24',
                    'our_mac':
                    '08:00:2b:01:02:04',
                    'our_money':
                    None,
                })
                self.expected_records.append({
                    'our_decimal':
                    decimal.Decimal('1234567899.99'),
                    'our_text':
                    'some text 4',
                    'our_bit':
                    False,
                    'our_integer':
                    55200,
                    'our_double':
                    decimal.Decimal('1.1'),
                    'id':
                    5,
                    'our_json':
                    self.inserted_records[-1]['our_json'],
                    'our_boolean':
                    True,
                    'our_jsonb':
                    self.inserted_records[-1]['our_jsonb'],
                    'our_bigint':
                    100000,
                    'OUR TS':
                    self.expected_ts(our_ts),
                    'OUR TS TZ':
                    self.expected_ts_tz(our_ts_tz),
                    'OUR TIME':
                    str(our_time),
                    'OUR TIME TZ':
                    str(our_time_tz),
                    'our_store': {
                        "name": "betty",
                        "size": "small"
                    },
                    'our_smallint':
                    1,
                    'OUR DATE':
                    '1999-09-09T00:00:00+00:00',
                    'our_varchar':
                    'our_varchar 4',
                    'our_uuid':
                    self.inserted_records[-1]['our_uuid'],
                    'our_real':
                    decimal.Decimal('1.2'),
                    'our_varchar_10':
                    'varchar_3',
                    'our_citext':
                    'cyclops 3',
                    'our_cidr':
                    '192.168.101.128/25',
                    'our_inet':
                    '192.168.101.128/24',
                    'our_mac':
                    '08:00:2b:01:02:04',
                    'our_money':
                    None,
                    'our_alignment_enum':
                    None,
                })
                # a new record to be deleted prior to sync
                our_ts = datetime.datetime(2111, 1, 1, 12, 12, 12, 222111)
                nyc_tz = pytz.timezone('America/New_York')
                our_ts_tz = nyc_tz.localize(our_ts)
                our_time = datetime.time(12, 11, 10)
                our_time_tz = our_time.isoformat() + "-04:00"
                our_date = datetime.date(1999, 9, 9)
                my_uuid = str(uuid.uuid1())
                self.inserted_records.append({
                    'our_varchar':
                    "our_varchar 4",
                    'our_varchar_10':
                    "varchar_3",
                    'our_text':
                    "some text 4",
                    'our_integer':
                    55200,
                    'our_smallint':
                    1,
                    'our_bigint':
                    100000,
                    'our_decimal':
                    decimal.Decimal('1234567899.99'),
                    quote_ident('OUR TS', cur):
                    our_ts,
                    quote_ident('OUR TS TZ', cur):
                    our_ts_tz,
                    quote_ident('OUR TIME', cur):
                    our_time,
                    quote_ident('OUR TIME TZ', cur):
                    our_time_tz,
                    quote_ident('OUR DATE', cur):
                    our_date,
                    'our_double':
                    decimal.Decimal('1.1'),
                    'our_real':
                    decimal.Decimal('1.2'),
                    'our_boolean':
                    True,
                    'our_bit':
                    '0',
                    'our_json':
                    json.dumps('some string'),
                    'our_jsonb':
                    json.dumps(['burgers are good']),
                    'our_uuid':
                    my_uuid,
                    'our_store':
                    'size=>"small",name=>"betty"',
                    'our_citext':
                    'cyclops 3',
                    'our_cidr':
                    '192.168.101.128/25',
                    'our_inet':
                    '192.168.101.128/24',
                    'our_mac':
                    '08:00:2b:01:02:04',
                    'our_money':
                    None,
                })
                self.expected_records.append({
                    'our_decimal':
                    decimal.Decimal('1234567899.99'),
                    'our_text':
                    'some text 4',
                    'our_bit':
                    False,
                    'our_integer':
                    55200,
                    'our_double':
                    decimal.Decimal('1.1'),
                    'id':
                    6,
                    'our_json':
                    self.inserted_records[-1]['our_json'],
                    'our_boolean':
                    True,
                    'our_jsonb':
                    self.inserted_records[-1]['our_jsonb'],
                    'our_bigint':
                    100000,
                    'OUR TS':
                    self.expected_ts(our_ts),
                    'OUR TS TZ':
                    self.expected_ts_tz(our_ts_tz),
                    'OUR TIME':
                    str(our_time),
                    'OUR TIME TZ':
                    str(our_time_tz),
                    'our_store': {
                        "name": "betty",
                        "size": "small"
                    },
                    'our_smallint':
                    1,
                    'OUR DATE':
                    '1999-09-09T00:00:00+00:00',
                    'our_varchar':
                    'our_varchar 4',
                    'our_uuid':
                    self.inserted_records[-1]['our_uuid'],
                    'our_real':
                    decimal.Decimal('1.2'),
                    'our_varchar_10':
                    'varchar_3',
                    'our_citext':
                    'cyclops 3',
                    'our_cidr':
                    '192.168.101.128/25',
                    'our_inet':
                    '192.168.101.128/24',
                    'our_mac':
                    '08:00:2b:01:02:04',
                    'our_money':
                    None,
                    'our_alignment_enum':
                    None,
                })

                db_utils.insert_record(cur, test_table_name,
                                       self.inserted_records[3])
                db_utils.insert_record(cur, test_table_name,
                                       self.inserted_records[4])
                db_utils.insert_record(cur, test_table_name,
                                       self.inserted_records[5])

                # updating ...
                # an existing record
                canon_table_name = db_utils.canonicalized_table_name(
                    cur, test_schema_name, test_table_name)
                record_pk = 1
                our_ts = datetime.datetime(2021, 4, 4, 4, 4, 4, 733184)
                our_ts_tz = nyc_tz.localize(our_ts)
                updated_data = {
                    "OUR TS TZ": our_ts_tz,
                    "our_double": decimal.Decimal("6.6"),
                    "our_money": "$0.00"
                }
                self.expected_records[0]["OUR TS TZ"] = self.expected_ts_tz(
                    our_ts_tz)
                self.expected_records[0]["our_double"] = decimal.Decimal("6.6")
                self.expected_records[0]["our_money"] = "$0.00"

                db_utils.update_record(cur, canon_table_name, record_pk,
                                       updated_data)

                # a newly inserted record
                canon_table_name = db_utils.canonicalized_table_name(
                    cur, test_schema_name, test_table_name)
                record_pk = 5
                our_ts = datetime.datetime(2021, 4, 4, 4, 4, 4, 733184)
                our_ts_tz = nyc_tz.localize(our_ts)
                updated_data = {
                    "OUR TS TZ": our_ts_tz,
                    "our_double": decimal.Decimal("6.6"),
                    "our_money": "$0.00"
                }
                self.expected_records[4]["OUR TS TZ"] = self.expected_ts_tz(
                    our_ts_tz)
                self.expected_records[4]["our_double"] = decimal.Decimal("6.6")
                self.expected_records[4]["our_money"] = "$0.00"

                db_utils.update_record(cur, canon_table_name, record_pk,
                                       updated_data)

                # deleting
                # an existing record
                record_pk = 2
                db_utils.delete_record(cur, canon_table_name, record_pk)

                # a newly inserted record
                record_pk = 6
                db_utils.delete_record(cur, canon_table_name, record_pk)

        #----------------------------------------------------------------------
        # invoke the sync job AGAIN after various manipulations
        #----------------------------------------------------------------------

        # run sync job 3 and verify exit codes
        sync_job_name = runner.run_sync_mode(self, conn_id)
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # get records
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        records_by_stream = runner.get_records_from_target_output()
        table_version_3 = records_by_stream[test_table_name]['table_version']
        messages = records_by_stream[test_table_name]['messages']

        # verify the expected number of records were replicated
        self.assertEqual(4, record_count_by_stream[test_table_name])

        # verify the message actions match expectations
        self.assertEqual(5, len(messages))
        self.assertEqual('upsert', messages[0]['action'])
        self.assertEqual('upsert', messages[1]['action'])
        self.assertEqual('upsert', messages[2]['action'])
        self.assertEqual('upsert', messages[3]['action'])
        self.assertEqual('activate_version', messages[4]['action'])

        # verify the new table version increased on the third sync
        self.assertGreater(table_version_3, table_version_2)

        # verify the persisted schema still matches expectations
        self.assertEqual(expected_schemas[test_table_name],
                         records_by_stream[test_table_name]['schema'])

        # NB | This is a little tough to track mentally so here's a breakdown of
        #      the order of operations by expected records indexes:

        #      Prior to Sync 1
        #        insert 0, 1, 2

        #      Prior to Sync 2
        #        No db changes

        #      Prior to Sync 3
        #        insert 3, 4, 5
        #        update 0, 4
        #        delete 1, 5

        #      Resulting Synced Records: 2, 3, 0, 4

        # verify replicated records still match expectations
        self.assertDictEqual(self.expected_records[2],
                             messages[0]['data'])  # existing insert
        self.assertDictEqual(self.expected_records[3],
                             messages[1]['data'])  # new insert
        self.assertDictEqual(self.expected_records[0],
                             messages[2]['data'])  # existing update
        self.assertDictEqual(self.expected_records[4],
                             messages[3]['data'])  # new insert / update

        # grab bookmarked state
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][
            'dev-public-postgres_full_table_replication_test']

        # verify state and bookmarks meet expectations
        self.assertIsNone(state['currently_syncing'])
        self.assertIsNone(bookmark.get('lsn'))
        self.assertIsNone(bookmark.get('replication_key'))
        self.assertIsNone(bookmark.get('replication_key_value'))
        self.assertEqual(table_version_3, bookmark['version'])
Example No. 9
    def test_run(self):

        conn_id = connections.ensure_connection(self, payload_hook=None)

        # Run the tap in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # Verify the check's exit status
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # Verify that there are catalogs found
        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        subset = self.expected_check_streams().issubset(found_catalog_names)
        self.assertTrue(
            subset,
            msg=
            "Expected check streams are not a subset of the discovered catalogs, missing streams={}"
            .format(
                self.expected_check_streams().difference(found_catalog_names)))
        # Select some catalogs
        our_catalogs = [
            c for c in found_catalogs
            if c.get('tap_stream_id') in self.expected_sync_streams()
        ]
        for catalog in our_catalogs:
            schema = menagerie.get_annotated_schema(conn_id,
                                                    catalog['stream_id'])
            connections.select_catalog_and_fields_via_metadata(
                conn_id, catalog, schema, [], [])

        # Verify that all streams sync at least one row for initial sync.
        # This test also verifies access token expiration handling: if the test
        # fails with an authentication error, the refresh token was not replaced
        # after expiring.
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())

        # Verify that all streams sync at least one row for initial sync
        for stream in self.expected_sync_streams().difference({
                'feature_events',
                'events',
                'page_events',
                'guide_events',
                'poll_events',
                'track_events',
                'track_types',
        }):
            with self.subTest(stream=stream):
                self.assertLess(0, record_count_by_stream[stream])

        # TODO run the remaining assertions against all incremental streams

        # Verify that bookmark values are correct after incremental sync
        start_date = self.get_properties()['start_date']
        current_state = menagerie.get_state(conn_id)
        test_bookmark = current_state['bookmarks']['accounts']

        # Verify a bookmark is present for accounts
        self.assertIn('bookmarks', current_state.keys())
        self.assertIn('accounts', current_state['bookmarks'].keys())
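        # A commented-out sketch of how the TODO above could be addressed once the
        # remaining incremental streams are covered. expected_incremental_streams()
        # is an assumption here (it is not defined in this snippet); the shape of
        # the check mirrors the accounts assertions above.
        #
        # for stream in self.expected_incremental_streams():
        #     with self.subTest(stream=stream):
        #         self.assertIn(stream, current_state['bookmarks'])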
Example No. 10
    def do_test(self, conn_id):
        # Select our catalogs
        our_catalogs = [
            c for c in self.found_catalogs
            if c.get('tap_stream_id') in self.expected_sync_streams()
        ]
        for c in our_catalogs:
            c_annotated = menagerie.get_annotated_schema(
                conn_id, c['stream_id'])
            c_metadata = metadata.to_map(c_annotated['metadata'])
            connections.select_catalog_and_fields_via_metadata(
                conn_id, c, c_annotated, [], [])

        # Clear state before our run
        menagerie.set_state(conn_id, {})

        # Run a sync job using orchestrator
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # Verify actual rows were synced
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = reduce(lambda accum, c: accum + c,
                                      record_count_by_stream.values())
        self.assertGreater(replicated_row_count,
                           0,
                           msg="failed to replicate any data: {}".format(
                               record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        # Ensure all records have a value for PK(s)
        records = runner.get_records_from_target_output()
        for stream in self.expected_sync_streams():
            messages = records.get(stream, {}).get('messages', [])
            for m in messages:
                pk_set = self.expected_pks()[stream]
                for pk in pk_set:
                    self.assertIsNotNone(m.get('data', {}).get(pk),
                                         msg="oh no! {}".format(m))

        satisfaction_ratings_bookmark = "2020-03-05T14:14:42Z"

        state = menagerie.get_state(conn_id)
        state['bookmarks']['satisfaction_ratings'][
            'updated_at'] = satisfaction_ratings_bookmark
        menagerie.set_state(conn_id, state)

        # Create a new record
        creds = {
            "email": "*****@*****.**",
            "subdomain": self.get_properties()['subdomain'],
            "token": os.getenv('TAP_ZENDESK_API_TOKEN')
        }

        self.client = Zenpy(**creds)

        # Create some new objects
        group_name = str(uuid.uuid4())
        group = Group(name=group_name)
        self.created_group = self.client.groups.create(group)

        org_name = str(uuid.uuid4())
        org = Organization(name=org_name)
        self.created_org = self.client.organizations.create(org)

        user = User(name="John Doe",
                    email="{}@mailinator.com".format(uuid.uuid4()))
        self.created_user = self.client.users.create(user)

        # Sleeping 1 minute to validate lookback behavior needed in tap
        # We've observed a delay between when users are created and when
        # they're available through the API
        print("sleeping for 60 seconds")
        time.sleep(60)
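        # A commented-out alternative to the fixed sleep (an assumption, not the
        # original approach): poll until the created user is visible, bounded by a
        # deadline. The exact Zenpy lookup call is illustrative and may differ.
        #
        # deadline = time.time() + 120
        # while time.time() < deadline:
        #     try:
        #         if self.client.users(id=self.created_user.id):
        #             break
        #     except Exception:
        #         pass
        #     time.sleep(5)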

        # Run another Sync
        sync_job_name = runner.run_sync_mode(self, conn_id)
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # Check both sets of records and make sure we have our new rows
        records = runner.get_records_from_target_output()
        messages = records.get('groups', {}).get('messages', [])
        new_record = [
            r for r in messages if r['data']['id'] == self.created_group.id
        ]
        self.assertTrue(any(new_record))
        self.assertEqual(len(messages),
                         2,
                         msg="Sync'd incorrect count of messages: {}".format(
                             len(messages)))

        messages = records.get('organizations', {}).get('messages', [])

        new_record = [
            r for r in messages if r['data']['id'] == self.created_org.id
        ]
        self.assertTrue(any(new_record))
        self.assertEqual(len(messages),
                         2,
                         msg="Sync'd incorrect count of messages: {}".format(
                             len(messages)))

        messages = records.get('users', {}).get('messages', [])
        new_record = [
            r for r in messages if r['data']['id'] == self.created_user.id
        ]
        self.assertTrue(any(new_record))
        # NB: GreaterEqual because we suspect Zendesk updates users in the backend
        # >= 1 because we're no longer inclusive of the last replicated user record. The lookback will control this going forward.
        # If we get the user we wanted and then some, this assertion should succeed
        self.assertGreaterEqual(
            len(messages),
            1,
            msg="Sync'd incorrect count of messages: {}".format(len(messages)))

        messages = records.get('satisfaction_ratings', {}).get('messages', [])
        new_record = [
            r for r in messages
            if r['data']['id'] in [364471784994, 364465631433, 364465212373]
        ]
        self.assertTrue(any(new_record))
        self.assertGreaterEqual(
            len(messages),
            3,
            msg="Sync'd incorrect count of messages: {}".format(len(messages)))
        for message in messages:
            self.assertGreaterEqual(
                utils.strptime_to_utc(
                    message.get('data', {}).get('updated_at', '')),
                utils.strptime_to_utc(satisfaction_ratings_bookmark))
Example No. 11
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # run in check mode and verify exit codes
        check_job_name = runner.run_check_mode(self, conn_id)
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify basics of discovery are consistent with expectations...

        # verify discovery produced (at least) 1 expected catalog
        found_catalogs = [
            found_catalog for found_catalog in menagerie.get_catalogs(conn_id)
            if found_catalog['tap_stream_id'] in self.expected_check_streams()
        ]
        self.assertGreaterEqual(len(found_catalogs), 1)

        # verify the tap discovered the expected streams
        found_catalog_names = {
            catalog['tap_stream_id']
            for catalog in found_catalogs
        }
        self.assertSetEqual(self.expected_check_streams(), found_catalog_names)

        # verify that persisted streams have the correct properties
        test_catalog = found_catalogs[0]
        self.assertEqual(test_table_name, test_catalog['stream_name'])
        print("discovered streams are correct")

        # perform table selection
        print('selecting {} and all fields within the table'.format(
            test_table_name))
        schema_and_metadata = menagerie.get_annotated_schema(
            conn_id, test_catalog['stream_id'])
        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'INCREMENTAL',
                'replication-key': 'OUR TS TZ'
            }
        }]
        _ = connections.select_catalog_and_fields_via_metadata(
            conn_id, test_catalog, schema_and_metadata, additional_md)

        # clear state
        menagerie.set_state(conn_id, {})

        # run sync job 1 and verify exit codes
        sync_job_name = runner.run_sync_mode(self, conn_id)
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # get records
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(),
            self.expected_primary_keys())
        records_by_stream = runner.get_records_from_target_output()
        table_version = records_by_stream[test_table_name]['table_version']
        messages = records_by_stream[test_table_name]['messages']

        # verify the expected number of records were replicated
        self.assertEqual(3, record_count_by_stream[test_table_name])

        # verify the message actions match expectations
        self.assertEqual(4, len(messages))
        self.assertEqual('activate_version', messages[0]['action'])
        self.assertEqual('upsert', messages[1]['action'])
        self.assertEqual('upsert', messages[2]['action'])
        self.assertEqual('upsert', messages[3]['action'])

        # verify the persisted schema matches expectations
        self.assertEqual(expected_schemas[test_table_name],
                         records_by_stream[test_table_name]['schema'])

        # verify replicated records match expectations
        self.assertDictEqual(self.expected_records[0], messages[1]['data'])
        self.assertDictEqual(self.expected_records[1], messages[2]['data'])
        self.assertDictEqual(self.expected_records[2], messages[3]['data'])

        # verify records are in ascending order by replication-key value
        expected_replication_key = list(
            self.expected_replication_keys()[test_table_name])[0]
        self.assertLess(messages[1]['data'][expected_replication_key],
                        messages[2]['data'][expected_replication_key])
        self.assertLess(messages[2]['data'][expected_replication_key],
                        messages[3]['data'][expected_replication_key])

        print("records are correct")

        # grab bookmarked state
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][
            'dev-public-postgres_incremental_replication_test']

        # verify state and bookmarks meet expectations
        self.assertIsNone(state['currently_syncing'])
        self.assertIsNone(bookmark.get('lsn'))
        self.assertEqual(table_version, bookmark['version'])
        self.assertEqual(expected_replication_key, bookmark['replication_key'])
        self.assertEqual(self.expected_records[2][expected_replication_key],
                         bookmark['replication_key_value'])

        #----------------------------------------------------------------------
        # invoke the sync job AGAIN following various manipulations to the data
        #----------------------------------------------------------------------

        with db_utils.get_test_connection('dev') as conn:
            conn.autocommit = True
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:

                # NB | We will perform the following actions prior to the next sync:
                #      [Action (EXPECTED RESULT)]

                #      Insert a record with a lower replication-key value (NOT REPLICATED)
                #      Insert a record with a higher replication-key value (REPLICATED)

                #      Insert a record with a higher replication-key value and...
                #      Delete it (NOT REPLICATED)

                #      Update a record with a higher replication-key value (REPLICATED)
                #      Update a record with a lower replication-key value (NOT REPLICATED)

                # inserting...
                # a record with a replication-key value that is lower than the previous bookmark
                nyc_tz = pytz.timezone('America/New_York')
                our_time_offset = "-04:00"
                our_ts = datetime.datetime(1996, 4, 4, 4, 4, 4, 733184)
                our_ts_tz = nyc_tz.localize(our_ts)
                our_time = datetime.time(6, 6, 6)
                our_time_tz = our_time.isoformat() + our_time_offset
                our_date = datetime.date(1970, 7, 1)
                my_uuid = str(uuid.uuid1())
                self.inserted_records.append({
                    'our_varchar':
                    "our_varchar 2",
                    'our_varchar_10':
                    "varchar_10",
                    'our_text':
                    "some text 2",
                    'our_integer':
                    44101,
                    'our_smallint':
                    2,
                    'our_bigint':
                    1000001,
                    'our_decimal':
                    decimal.Decimal('9876543210.02'),
                    quote_ident('OUR TS', cur):
                    our_ts,
                    quote_ident('OUR TS TZ', cur):
                    our_ts_tz,
                    quote_ident('OUR TIME', cur):
                    our_time,
                    quote_ident('OUR TIME TZ', cur):
                    our_time_tz,
                    quote_ident('OUR DATE', cur):
                    our_date,
                    'our_double':
                    decimal.Decimal('1.1'),
                    'our_real':
                    decimal.Decimal('1.2'),
                    'our_boolean':
                    True,
                    'our_bit':
                    '1',
                    'our_json':
                    json.dumps({'nymn': 77}),
                    'our_jsonb':
                    json.dumps({'burgers': 'good++'}),
                    'our_uuid':
                    my_uuid,
                    'our_citext':
                    'cyclops 2',
                    'our_store':
                    'dances=>"floor",name=>"betty"',
                    'our_cidr':
                    '192.168.101.128/25',
                    'our_inet':
                    '192.168.101.128/24',
                    'our_mac':
                    '08:00:2b:01:02:04',
                    'our_money':
                    '$0.98789'
                })
                self.expected_records.append({
                    'id':
                    4,
                    'our_varchar':
                    "our_varchar 2",
                    'our_varchar_10':
                    "varchar_10",
                    'our_text':
                    "some text 2",
                    'our_integer':
                    44101,
                    'our_smallint':
                    2,
                    'our_bigint':
                    1000001,
                    'our_decimal':
                    decimal.Decimal('9876543210.02'),
                    'OUR TS':
                    self.expected_ts(our_ts),
                    'OUR TS TZ':
                    self.expected_ts_tz(our_ts_tz),
                    'OUR TIME':
                    str(our_time),
                    'OUR TIME TZ':
                    str(our_time_tz),
                    'OUR DATE':
                    '1970-07-01T00:00:00+00:00',
                    'our_double':
                    decimal.Decimal('1.1'),
                    'our_real':
                    decimal.Decimal('1.2'),
                    'our_boolean':
                    True,
                    'our_bit':
                    True,
                    'our_json':
                    '{"nymn": 77}',
                    'our_jsonb':
                    '{"burgers": "good++"}',
                    'our_uuid':
                    self.inserted_records[-1]['our_uuid'],
                    'our_citext':
                    self.inserted_records[-1]['our_citext'],
                    'our_store': {
                        "name": "betty",
                        "dances": "floor"
                    },
                    'our_cidr':
                    self.inserted_records[-1]['our_cidr'],
                    'our_inet':
                    self.inserted_records[-1]['our_inet'],
                    'our_mac':
                    self.inserted_records[-1]['our_mac'],
                    'our_money':
                    '$0.99'
                })
                # a record with a replication-key value that is higher than the previous bookmark
                our_ts = datetime.datetime(2007, 1, 1, 12, 12, 12, 222111)
                nyc_tz = pytz.timezone('America/New_York')
                our_ts_tz = nyc_tz.localize(our_ts)
                our_time = datetime.time(12, 11, 10)
                our_time_tz = our_time.isoformat() + "-04:00"
                our_date = datetime.date(1999, 9, 9)
                my_uuid = str(uuid.uuid1())
                self.inserted_records.append({
                    'our_varchar':
                    "our_varchar 4",
                    'our_varchar_10':
                    "varchar_3",
                    'our_text':
                    "some text 4",
                    'our_integer':
                    55200,
                    'our_smallint':
                    1,
                    'our_bigint':
                    100000,
                    'our_decimal':
                    decimal.Decimal('1234567899.99'),
                    quote_ident('OUR TS', cur):
                    our_ts,
                    quote_ident('OUR TS TZ', cur):
                    our_ts_tz,
                    quote_ident('OUR TIME', cur):
                    our_time,
                    quote_ident('OUR TIME TZ', cur):
                    our_time_tz,
                    quote_ident('OUR DATE', cur):
                    our_date,
                    'our_double':
                    decimal.Decimal('1.1'),
                    'our_real':
                    decimal.Decimal('1.2'),
                    'our_boolean':
                    True,
                    'our_bit':
                    '0',
                    'our_json':
                    json.dumps('some string'),
                    'our_jsonb':
                    json.dumps(['burgers are good']),
                    'our_uuid':
                    my_uuid,
                    'our_store':
                    'size=>"small",name=>"betty"',
                    'our_citext':
                    'cyclops 3',
                    'our_cidr':
                    '192.168.101.128/25',
                    'our_inet':
                    '192.168.101.128/24',
                    'our_mac':
                    '08:00:2b:01:02:04',
                    'our_money':
                    None,
                })
                self.expected_records.append({
                    'our_decimal':
                    decimal.Decimal('1234567899.99'),
                    'our_text':
                    'some text 4',
                    'our_bit':
                    False,
                    'our_integer':
                    55200,
                    'our_double':
                    decimal.Decimal('1.1'),
                    'id':
                    5,
                    'our_json':
                    self.inserted_records[-1]['our_json'],
                    'our_boolean':
                    True,
                    'our_jsonb':
                    self.inserted_records[-1]['our_jsonb'],
                    'our_bigint':
                    100000,
                    'OUR TS':
                    self.expected_ts(our_ts),
                    'OUR TS TZ':
                    self.expected_ts_tz(our_ts_tz),
                    'OUR TIME':
                    str(our_time),
                    'OUR TIME TZ':
                    str(our_time_tz),
                    'our_store': {
                        "name": "betty",
                        "size": "small"
                    },
                    'our_smallint':
                    1,
                    'OUR DATE':
                    '1999-09-09T00:00:00+00:00',
                    'our_varchar':
                    'our_varchar 4',
                    'our_uuid':
                    self.inserted_records[-1]['our_uuid'],
                    'our_real':
                    decimal.Decimal('1.2'),
                    'our_varchar_10':
                    'varchar_3',
                    'our_citext':
                    'cyclops 3',
                    'our_cidr':
                    '192.168.101.128/25',
                    'our_inet':
                    '192.168.101.128/24',
                    'our_mac':
                    '08:00:2b:01:02:04',
                    'our_money':
                    None
                })
                # another record with a replication-key value that is higher than the previous bookmark (this row, id 6, survives and becomes the new bookmark)
                our_ts = datetime.datetime(2111, 1, 1, 12, 12, 12, 222111)
                nyc_tz = pytz.timezone('America/New_York')
                our_ts_tz = nyc_tz.localize(our_ts)
                our_time = datetime.time(12, 11, 10)
                our_time_tz = our_time.isoformat() + "-04:00"
                our_date = datetime.date(1999, 9, 9)
                my_uuid = str(uuid.uuid1())
                self.inserted_records.append({
                    'our_varchar':
                    "our_varchar 4",
                    'our_varchar_10':
                    "varchar_3",
                    'our_text':
                    "some text 4",
                    'our_integer':
                    55200,
                    'our_smallint':
                    1,
                    'our_bigint':
                    100000,
                    'our_decimal':
                    decimal.Decimal('1234567899.99'),
                    quote_ident('OUR TS', cur):
                    our_ts,
                    quote_ident('OUR TS TZ', cur):
                    our_ts_tz,
                    quote_ident('OUR TIME', cur):
                    our_time,
                    quote_ident('OUR TIME TZ', cur):
                    our_time_tz,
                    quote_ident('OUR DATE', cur):
                    our_date,
                    'our_double':
                    decimal.Decimal('1.1'),
                    'our_real':
                    decimal.Decimal('1.2'),
                    'our_boolean':
                    True,
                    'our_bit':
                    '0',
                    'our_json':
                    json.dumps('some string'),
                    'our_jsonb':
                    json.dumps(['burgers are good']),
                    'our_uuid':
                    my_uuid,
                    'our_store':
                    'size=>"small",name=>"betty"',
                    'our_citext':
                    'cyclops 3',
                    'our_cidr':
                    '192.168.101.128/25',
                    'our_inet':
                    '192.168.101.128/24',
                    'our_mac':
                    '08:00:2b:01:02:04',
                    'our_money':
                    None,
                })
                self.expected_records.append({
                    'our_decimal':
                    decimal.Decimal('1234567899.99'),
                    'our_text':
                    'some text 4',
                    'our_bit':
                    False,
                    'our_integer':
                    55200,
                    'our_double':
                    decimal.Decimal('1.1'),
                    'id':
                    6,
                    'our_json':
                    self.inserted_records[-1]['our_json'],
                    'our_boolean':
                    True,
                    'our_jsonb':
                    self.inserted_records[-1]['our_jsonb'],
                    'our_bigint':
                    100000,
                    'OUR TS':
                    self.expected_ts(our_ts),
                    'OUR TS TZ':
                    self.expected_ts_tz(our_ts_tz),
                    'OUR TIME':
                    str(our_time),
                    'OUR TIME TZ':
                    str(our_time_tz),
                    'our_store': {
                        "name": "betty",
                        "size": "small"
                    },
                    'our_smallint':
                    1,
                    'OUR DATE':
                    '1999-09-09T00:00:00+00:00',
                    'our_varchar':
                    'our_varchar 4',
                    'our_uuid':
                    self.inserted_records[-1]['our_uuid'],
                    'our_real':
                    decimal.Decimal('1.2'),
                    'our_varchar_10':
                    'varchar_3',
                    'our_citext':
                    'cyclops 3',
                    'our_cidr':
                    '192.168.101.128/25',
                    'our_inet':
                    '192.168.101.128/24',
                    'our_mac':
                    '08:00:2b:01:02:04',
                    'our_money':
                    None
                })

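                # insert the three new rows (ids 4-6 are assumed to come from the
                # table's serial primary key, matching the 'id' values expected above)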
                db_utils.insert_record(cur, test_table_name,
                                       self.inserted_records[3])
                db_utils.insert_record(cur, test_table_name,
                                       self.inserted_records[4])
                db_utils.insert_record(cur, test_table_name,
                                       self.inserted_records[5])

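                # For each change below, patch self.expected_records in place so the
                # assertions after sync 2 can compare full record payloads.
                # db_utils.update_record is assumed to issue an UPDATE ... WHERE id = <record_pk>.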
                # update a record with a replication-key value that is higher than the previous bookmark
                canon_table_name = db_utils.canonicalized_table_name(
                    cur, test_schema_name, test_table_name)
                record_pk = 1
                our_ts = datetime.datetime(2021, 4, 4, 4, 4, 4, 733184)
                our_ts_tz = nyc_tz.localize(our_ts)
                updated_data = {
                    "OUR TS TZ": our_ts_tz,
                    "our_double": decimal.Decimal("6.6"),
                    "our_money": "$0.00"
                }
                self.expected_records[0]["OUR TS TZ"] = self.expected_ts_tz(
                    our_ts_tz)
                self.expected_records[0]["our_double"] = decimal.Decimal("6.6")
                self.expected_records[0]["our_money"] = "$0.00"
                db_utils.update_record(cur, canon_table_name, record_pk,
                                       updated_data)

                # update a record with a replication-key value that is lower than the previous bookmark
                canon_table_name = db_utils.canonicalized_table_name(
                    cur, test_schema_name, test_table_name)
                record_pk = 2
                our_ts = datetime.datetime(1990, 4, 4, 4, 4, 4, 733184)
                our_ts_tz = nyc_tz.localize(our_ts)
                updated_data = {
                    "OUR TS TZ": our_ts_tz,
                    "our_double": decimal.Decimal("6.6"),
                    "our_money": "$0.00"
                }
                self.expected_records[1]["OUR TS TZ"] = self.expected_ts_tz(
                    our_ts_tz)
                self.expected_records[1]["our_double"] = decimal.Decimal("6.6")
                self.expected_records[1]["our_money"] = "$0.00"
                db_utils.update_record(cur, canon_table_name, record_pk,
                                       updated_data)

                # delete a newly inserted record with a higher replication key than the previous bookmark
                record_pk = 5
                db_utils.delete_record(cur, canon_table_name, record_pk)

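        # Sync 2 should only emit rows whose replication key ("OUR TS TZ") is at or
        # above the saved bookmark: the previously bookmarked row, the updated row
        # with id 1, and the newly inserted row with id 6. Lower-key changes and the
        # hard delete are invisible to key-based incremental replication.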
        # run sync job 2 and verify exit codes
        sync_job_name = runner.run_sync_mode(self, conn_id)
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # grab records
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(),
            self.expected_primary_keys())
        records_by_stream = runner.get_records_from_target_output()
        messages = records_by_stream[test_table_name]['messages']

        # verify the expected number of records were synced
        self.assertEqual(3, record_count_by_stream[test_table_name])

        # verify the message actions match expectations
        self.assertEqual('activate_version', messages[0]['action'])
        self.assertEqual('upsert', messages[1]['action'])
        self.assertEqual('upsert', messages[2]['action'])
        self.assertEqual('upsert', messages[3]['action'])

        # verify the persisted schema matches expectations
        self.assertEqual(expected_schemas[test_table_name],
                         records_by_stream[test_table_name]['schema'])

        # verify replicated records meet our expectations...

        # verify the first record was the bookmarked record from the previous sync
        self.assertDictEqual(self.expected_records[2], messages[1]['data'])

        # verify the expected updated record with a higher replication-key value was replicated
        self.assertDictEqual(self.expected_records[0], messages[2]['data'])

        # verify the expected inserted record with a lower replication-key value was NOT replicated
        actual_record_ids = [message['data']['id'] for message in messages[1:]]
        expected_record_id = self.expected_records[3]['id']
        self.assertNotIn(expected_record_id, actual_record_ids)

        # verify the deleted record with a higher replication-key value was NOT replicated
        expected_record_id = self.expected_records[4]['id']
        self.assertNotIn(expected_record_id, actual_record_ids)

        # verify the expected updated record with a lower replication-key value was NOT replicated
        expected_record_id = self.expected_records[1]['id']
        self.assertNotIn(expected_record_id, actual_record_ids)

        # verify the expected inserted record with a higher replication-key value was replicated
        self.assertDictEqual(self.expected_records[5], messages[3]['data'])

        # verify records are in ascending order by replication-key value
        self.assertLess(messages[1]['data'][expected_replication_key],
                        messages[2]['data'][expected_replication_key])
        self.assertLess(messages[2]['data'][expected_replication_key],
                        messages[3]['data'][expected_replication_key])

        print("records are correct")

        # get bookmarked state
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][
            'dev-public-postgres_incremental_replication_test']

        # verify the bookmarked state matches our expectations
        self.assertIsNone(bookmark.get('lsn'))
        self.assertEqual(bookmark['version'], table_version)
        self.assertEqual(bookmark['replication_key'], expected_replication_key)
        self.assertEqual(bookmark['replication_key_value'],
                         self.expected_records[5][expected_replication_key])

        #---------------------------------------------------------------------
        # run sync AGAIN after deleting a record and get 1 record (prev bookmark)
        #----------------------------------------------------------------------

        # Delete a pre-existing record from the database
        with db_utils.get_test_connection('dev') as conn:
            conn.autocommit = True
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:

                # delete a record with a lower replication key than the previous sync
                record_pk = 1
                db_utils.delete_record(cur, canon_table_name, record_pk)
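                # key-based incremental replication cannot detect hard deletes, so
                # sync 3 below is still expected to emit only the bookmarked record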

        # run sync job 3 and verify exit codes
        sync_job_name = runner.run_sync_mode(self, conn_id)
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # get records
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(),
            self.expected_primary_keys())
        records_by_stream = runner.get_records_from_target_output()
        messages = records_by_stream[test_table_name]['messages']

        # verify the expected number of records were replicated
        self.assertEqual(1, record_count_by_stream[test_table_name])

        # verify messages match our expectations
        self.assertEqual(2, len(messages))
        self.assertEqual(messages[0]['action'], 'activate_version')
        self.assertEqual(messages[1]['action'], 'upsert')
        self.assertEqual(records_by_stream[test_table_name]['table_version'],
                         table_version)

        # verify replicated records meet our expectations...

        # verify we did not re-replicate the deleted record
        actual_record_ids = [message['data']['id'] for message in messages[1:]]
        expected_record_id = self.expected_records[0]['id']
        self.assertNotIn(expected_record_id, actual_record_ids)

        # verify only the previously bookmarked record was synced
        self.assertDictEqual(self.expected_records[5], messages[1]['data'])

        print("records are correct")

        # get bookmarked state
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][
            'dev-public-postgres_incremental_replication_test']

        # verify the bookmarked state matches our expectations
        self.assertIsNone(bookmark.get('lsn'))
        self.assertEqual(bookmark['version'], table_version)
        self.assertEqual(bookmark['replication_key'], expected_replication_key)
        self.assertEqual(bookmark['replication_key_value'],
                         self.expected_records[5][expected_replication_key])
Example No. 12
0
    def test_run(self):

        conn_id = connections.ensure_connection(self)

        #  -------------------------------
        # -----------  Discovery ----------
        #  -------------------------------

        # run in discovery mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = menagerie.get_catalogs(conn_id)

        # assert we find the correct streams
        self.assertEqual(self.expected_check_streams(),
                         {c['tap_stream_id']
                          for c in found_catalogs})

        for tap_stream_id in self.expected_check_streams():
            found_stream = [
                c for c in found_catalogs
                if c['tap_stream_id'] == tap_stream_id
            ][0]

            # assert that the pks are correct
            self.assertEqual(
                self.expected_pks()[found_stream['stream_name']],
                set(
                    found_stream.get('metadata',
                                     {}).get('table-key-properties')))

            # assert that the row counts are correct
            self.assertEqual(
                self.expected_row_counts()[found_stream['stream_name']],
                found_stream.get('metadata', {}).get('row-count'))

        #  -----------------------------------
        # ----------- Initial Full Table ---------
        #  -----------------------------------
        # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata
        for stream_catalog in found_catalogs:
            annotated_schema = menagerie.get_annotated_schema(
                conn_id, stream_catalog['stream_id'])
            additional_md = [{
                "breadcrumb": [],
                "metadata": {
                    'replication-method': 'LOG_BASED'
                }
            }]
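            # the empty breadcrumb applies this metadata to the stream itself,
            # forcing LOG_BASED (oplog) replication for every selected collection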
            selected_metadata = connections.select_catalog_and_fields_via_metadata(
                conn_id, stream_catalog, annotated_schema, additional_md)

        # Run sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify the persisted schema was correct
        records_by_stream = runner.get_records_from_target_output()

        # assert that each of the streams that we synced are the ones that we expect to see
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())

        # Verify that the full table was synced
        for tap_stream_id in self.expected_sync_streams():
            self.assertGreaterEqual(record_count_by_stream[tap_stream_id],
                                    self.expected_row_counts()[tap_stream_id])

        # Verify that we have an 'initial_full_table_complete' bookmark
        state = menagerie.get_state(conn_id)
        first_versions = {}

        for tap_stream_id in self.expected_check_streams():
            # assert that the state has an initial_full_table_complete == True
            self.assertTrue(state['bookmarks'][tap_stream_id]
                            ['initial_full_table_complete'])
            # assert that there is a version bookmark in state
            first_versions[tap_stream_id] = state['bookmarks'][tap_stream_id][
                'version']
            self.assertIsNotNone(first_versions[tap_stream_id])
            # Verify that we have an oplog_ts_time and an oplog_ts_inc bookmark
            self.assertIsNotNone(
                state['bookmarks'][tap_stream_id]['oplog_ts_time'])
            self.assertIsNotNone(
                state['bookmarks'][tap_stream_id]['oplog_ts_inc'])

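        # Make a mix of changes directly in MongoDB (2 deletes, 2 updates, and
        # 2 inserts per collection) and remember the affected _ids so the oplog
        # sync below can be checked against exactly this set of documents.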
        changed_ids = set()
        with get_test_connection() as client:
            # Delete two documents for each collection

            changed_ids.add(client['simple_db']['simple_coll_1'].find(
                {'int_field': 0})[0]['_id'])
            client["simple_db"]["simple_coll_1"].delete_one({'int_field': 0})

            changed_ids.add(client['simple_db']['simple_coll_1'].find(
                {'int_field': 1})[0]['_id'])
            client["simple_db"]["simple_coll_1"].delete_one({'int_field': 1})

            changed_ids.add(client['simple_db']['simple_coll_2'].find(
                {'int_field': 0})[0]['_id'])
            client["simple_db"]["simple_coll_2"].delete_one({'int_field': 0})

            changed_ids.add(client['simple_db']['simple_coll_2'].find(
                {'int_field': 1})[0]['_id'])
            client["simple_db"]["simple_coll_2"].delete_one({'int_field': 1})

            # Update two documents for each collection
            changed_ids.add(client['simple_db']['simple_coll_1'].find(
                {'int_field': 48})[0]['_id'])
            client["simple_db"]["simple_coll_1"].update_one(
                {'int_field': 48}, {'$set': {
                    'int_field': -1
                }})

            changed_ids.add(client['simple_db']['simple_coll_1'].find(
                {'int_field': 49})[0]['_id'])
            client["simple_db"]["simple_coll_1"].update_one(
                {'int_field': 49}, {'$set': {
                    'int_field': -1
                }})

            changed_ids.add(client['simple_db']['simple_coll_2'].find(
                {'int_field': 98})[0]['_id'])
            client["simple_db"]["simple_coll_2"].update_one(
                {'int_field': 98}, {'$set': {
                    'int_field': -1
                }})

            changed_ids.add(client['simple_db']['simple_coll_2'].find(
                {'int_field': 99})[0]['_id'])
            client["simple_db"]["simple_coll_2"].update_one(
                {'int_field': 99}, {'$set': {
                    'int_field': -1
                }})

            # Insert two documents for each collection
            client["simple_db"]["simple_coll_1"].insert_one({
                "int_field":
                50,
                "string_field":
                random_string_generator()
            })
            changed_ids.add(client['simple_db']['simple_coll_1'].find(
                {'int_field': 50})[0]['_id'])

            client["simple_db"]["simple_coll_1"].insert_one({
                "int_field":
                51,
                "string_field":
                random_string_generator()
            })
            changed_ids.add(client['simple_db']['simple_coll_1'].find(
                {'int_field': 51})[0]['_id'])

            client["simple_db"]["simple_coll_2"].insert_one({
                "int_field":
                100,
                "string_field":
                random_string_generator()
            })
            changed_ids.add(client['simple_db']['simple_coll_2'].find(
                {'int_field': 100})[0]['_id'])

            client["simple_db"]["simple_coll_2"].insert_one({
                "int_field":
                101,
                "string_field":
                random_string_generator()
            })
            changed_ids.add(client['simple_db']['simple_coll_2'].find(
                {'int_field': 101})[0]['_id'])

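        # The oplog sync below should pick up at least 6 changed records per
        # collection (the gte clause on the oplog timestamp can re-emit a few
        # extras), and the two deleted documents in each collection should carry
        # an _sdc_deleted_at value.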
        #  -----------------------------------
        # ----------- Subsequent Oplog Sync ---------
        #  -----------------------------------

        # Run sync

        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify the persisted schema was correct
        messages_by_stream = runner.get_records_from_target_output()
        records_by_stream = {}
        for stream_name in self.expected_sync_streams():
            records_by_stream[stream_name] = [
                x for x in messages_by_stream[stream_name]['messages']
                if x.get('action') == 'upsert'
            ]

        # assert that each of the streams that we synced are the ones that we expect to see
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())

        # Verify that we got at least 6 records due to changes
        # (could be more due to overlap in gte oplog clause)
        for k, v in record_count_by_stream.items():
            self.assertGreaterEqual(v, 6)

        # Verify that we got 2 records with _SDC_DELETED_AT
        self.assertEqual(
            2,
            len([
                x['data'] for x in records_by_stream['simple_coll_1']
                if x['data'].get('_sdc_deleted_at')
            ]))
        self.assertEqual(
            2,
            len([
                x['data'] for x in records_by_stream['simple_coll_2']
                if x['data'].get('_sdc_deleted_at')
            ]))
        # Verify that the _id of the records sent are the same set as the
        # _ids of the documents changed
        actual = set([
            ObjectId(x['data']['_id'])
            for x in records_by_stream['simple_coll_1']
        ]).union(
            set([
                ObjectId(x['data']['_id'])
                for x in records_by_stream['simple_coll_2']
            ]))
        self.assertEqual(changed_ids, actual)
Example No. 13
0
    def test_run(self):
        """
        Verify that a full sync can send capture all data and send it in the correct format
        for integer and boolean (bit) data.
        Verify that the fist sync sends an activate immediately.
        Verify that the table version is incremented up
        """

        print("running test {}".format(self.name()))

        conn_id = self.create_connection()

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # get the catalog information of discovery
        found_catalogs = menagerie.get_catalogs(conn_id)
        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'LOG_BASED'
            }
        }]
        BaseTapTest.select_all_streams_and_fields(conn_id,
                                                  found_catalogs,
                                                  additional_md=additional_md)

        # clear state
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify record counts of streams
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        # self.assertEqual(record_count_by_stream, expected_count)

        # verify records match on the first sync
        records_by_stream = runner.get_records_from_target_output()

        table_version = dict()
        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                table_version[stream] = records_by_stream[stream][
                    'table_version']

                # verify on the first sync you get
                # activate version message before and after all data for the full table
                # and before the logical replication part
                if records_by_stream[stream]['messages'][-1].get("data"):
                    last_row_data = True
                else:
                    last_row_data = False
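                # For the first LOG_BASED sync the expected message layout is:
                # activate_version, the full-table upserts, then two more
                # activate_version messages (end of full table, start of log-based),
                # optionally followed by one trailing log-based upsert;
                # last_row_data flags that trailing upsert.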

                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-2]['action'],
                    'activate_version')
                if last_row_data:
                    self.assertEqual(
                        records_by_stream[stream]['messages'][-3]['action'],
                        'activate_version')
                else:
                    self.assertEqual(
                        records_by_stream[stream]['messages'][-1]['action'],
                        'activate_version')
                self.assertEqual(
                    len([
                        m for m in records_by_stream[stream]['messages'][1:]
                        if m["action"] == "activate_version"
                    ]),
                    2,
                    msg=
                    "Expect 2 more activate version messages for end of full table and beginning of log based"
                )

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(
                            zip(column_names, stream_expected_data[self.VALUES]
                                [row]))
                    }
                } for row in range(len(stream_expected_data[self.VALUES]))]

                # Verify all data is correct for the full table part
                if last_row_data:
                    final_row = -3
                else:
                    final_row = -2
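                # slice off the leading activate_version and the trailing
                # activate_version pair (plus the optional trailing log-based row)
                # so that only the full-table upserts are compared below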

                for expected_row, actual_row in list(
                        zip(expected_messages, records_by_stream[stream]
                            ['messages'][1:final_row])):
                    with self.subTest(expected_row=expected_row):

                        self.assertEqual(actual_row["action"], "upsert")
                        self.assertEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row[
                                "data"].items():
                            column_index = [
                                list(key.keys())[0] for key in
                                self.expected_metadata()[stream][self.FIELDS]
                            ].index(column_name)
                            if self.expected_metadata()[stream][self.FIELDS][column_index][column_name][self.DATATYPE] \
                                    in ("real", "float") \
                                    and actual_row["data"][column_name] is not None:
                                self.assertEqual(
                                    type(actual_row["data"][column_name]),
                                    Decimal,
                                    msg=
                                    "float value is not represented as a number"
                                )
                                self.assertEqual(
                                    float(str(float32(expected_value))),
                                    float(
                                        str(
                                            float32(actual_row["data"]
                                                    [column_name]))),
                                    msg=
                                    "single value of {} doesn't match actual {}"
                                    .format(
                                        float(str(float32(expected_value))),
                                        float(
                                            str(
                                                float32(actual_row["data"]
                                                        [column_name])))))
                            else:
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_row, actual_row))

                # Verify all data is correct for the log replication part if sent
                if records_by_stream[stream]['messages'][-1].get("data"):
                    for column_name, expected_value in expected_messages[-1][
                            "data"].items():
                        if isinstance(expected_value, float):
                            self.assertEqual(
                                type(records_by_stream[stream]['messages'][-1]
                                     ["data"][column_name]),
                                Decimal,
                                msg="float value is not represented as a number"
                            )
                            self.assertEqual(
                                float(str(float32(expected_value))),
                                float(
                                    str(
                                        float32(records_by_stream[stream]
                                                ['messages'][-1]["data"]
                                                [column_name]))),
                                msg="single value of {} doesn't match actual {}"
                                .format(
                                    float(str(float32(expected_value))),
                                    float(
                                        str(
                                            float32(records_by_stream[stream]
                                                    ['messages'][-1]["data"]
                                                    [column_name])))))
                        else:
                            self.assertEqual(
                                expected_value,
                                records_by_stream[stream]['messages'][-1]
                                ["data"][column_name],
                                msg="expected: {} != actual {}".format(
                                    expected_value,
                                    records_by_stream[stream]['messages'][-1]
                                    ["data"][column_name]))

                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNotNone(
                    bookmark.get('current_log_version'),
                    msg=
                    "expected bookmark to have current_log_version because we are using log replication"
                )
                self.assertTrue(bookmark['initial_full_table_complete'],
                                msg="expected full table to be complete")
                initial_log_version = bookmark['current_log_version']

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(
                                     expected_schemas,
                                     records_by_stream[stream]['schema']))

        # ----------------------------------------------------------------------
        # invoke the sync job AGAIN after inserting, updating, and deleting rows
        # ----------------------------------------------------------------------

        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "float_precisions"
        column_name = ["pk", "float_24", "float_53", "real_24_bits"]
        insert_value = [(14, 100.1, 100.1, 100.1)]
        update_value = [(1, 101.2, 101.2, 101.2)]
        delete_value = [(5, )]
        query_list = (insert(database_name, schema_name, table_name,
                             insert_value))
        query_list.extend(
            delete_by_pk(database_name, schema_name, table_name, delete_value,
                         column_name[:1]))
        query_list.extend(
            update_by_pk(database_name, schema_name, table_name, update_value,
                         column_name))
        mssql_cursor_context_manager(*query_list)
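        # Rebuild the expected rows for the next sync: the last previously synced row
        # plus the insert, the delete, and the update, each padded with a value for
        # the new _sdc_deleted_at column (None except for the deleted row, whose
        # timestamp is only compared within a +/- 15 second window below).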
        insert_value = [(14, 100.1, 100.1, 100.1, None)]
        update_value = [(1, 101.2, 101.2, 101.2, None)]
        delete_value = [(5, None, None, None, datetime.utcnow())]
        self.EXPECTED_METADATA["data_types_database_dbo_float_precisions"]["values"] = \
            [self.expected_metadata()["data_types_database_dbo_float_precisions"]["values"][-1]] + \
            insert_value + delete_value + update_value
        self.EXPECTED_METADATA["data_types_database_dbo_float_precisions"][
            "fields"].append({
                "_sdc_deleted_at": {
                    'sql-datatype': 'datetime',
                    'selected-by-default': True,
                    'inclusion': 'automatic'
                }
            })

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)
        records_by_stream = runner.get_records_from_target_output()

        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                new_table_version = records_by_stream[stream]['table_version']

                # verify on a subsequent sync you get activate version message only after all data
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertTrue(
                    all([
                        message["action"] == "upsert" for message in
                        records_by_stream[stream]['messages'][1:]
                    ]))

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(
                            zip(column_names, stream_expected_data[self.VALUES]
                                [row]))
                    }
                } for row in range(len(stream_expected_data[self.VALUES]))]

                # remove sequences from actual values for comparison
                for message in records_by_stream[stream]['messages'][1:]:
                    message.pop("sequence")

                # Verify all data is correct
                for expected_row, actual_row in list(
                        zip(expected_messages,
                            records_by_stream[stream]['messages'][1:])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")

                        # we only send the _sdc_deleted_at column for deleted rows
                        self.assertGreaterEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")

                        for column_name, expected_value in expected_row[
                                "data"].items():
                            if column_name != "_sdc_deleted_at":
                                column_index = [
                                    list(key.keys())[0]
                                    for key in self.expected_metadata()[stream]
                                    [self.FIELDS]
                                ].index(column_name)
                                if self.expected_metadata()[stream][self.FIELDS][column_index][column_name][
                                    self.DATATYPE] \
                                        in ("real", "float") \
                                        and actual_row["data"][column_name] is not None:
                                    self.assertEqual(
                                        type(actual_row["data"][column_name]),
                                        Decimal,
                                        msg=
                                        "float value is not represented as a number"
                                    )
                                    self.assertEqual(
                                        float(str(float32(expected_value))),
                                        float(
                                            str(
                                                float32(actual_row["data"]
                                                        [column_name]))),
                                        msg=
                                        "single value of {} doesn't match actual {}"
                                        .format(
                                            float(str(
                                                float32(expected_value))),
                                            float(
                                                str(
                                                    float32(actual_row["data"]
                                                            [column_name])))))
                                else:
                                    self.assertEqual(
                                        expected_value,
                                        actual_row["data"][column_name],
                                        msg="expected: {} != actual {}".format(
                                            expected_row, actual_row))
                            elif expected_value:
                                # we have an expected value for a deleted row
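                                # the replicated deletion timestamp may or may not
                                # include fractional seconds, so try both formats and
                                # then allow roughly 15 seconds of clock skew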
                                try:
                                    actual_value = datetime.strptime(
                                        actual_row["data"][column_name],
                                        "%Y-%m-%dT%H:%M:%S.%fZ")
                                except ValueError:
                                    actual_value = datetime.strptime(
                                        actual_row["data"][column_name],
                                        "%Y-%m-%dT%H:%M:%SZ")
                                self.assertGreaterEqual(
                                    actual_value,
                                    expected_value - timedelta(seconds=15))
                                self.assertLessEqual(
                                    actual_value,
                                    expected_value + timedelta(seconds=15))
                            else:
                                # the row wasn't deleted, so the column is either omitted or None
                                self.assertIsNone(
                                    actual_row["data"].get(column_name))

                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNotNone(
                    bookmark.get('current_log_version'),
                    msg=
                    "expected bookmark to have current_log_version because we are using log replication"
                )
                self.assertTrue(bookmark['initial_full_table_complete'],
                                msg="expected full table to be complete")
                new_log_version = bookmark['current_log_version']
                self.assertGreater(new_log_version,
                                   initial_log_version,
                                   msg='expected log version to increase')

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")
                self.assertEqual(
                    bookmark['version'],
                    new_table_version,
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(
                                     expected_schemas,
                                     records_by_stream[stream]['schema']))