Example #1
    def test_run(self):
        """
        Verify that a full sync captures all of the data and sends it in the
        correct format.
        Verify that the first sync sends an activate_version message immediately.
        Verify that the table version is incremented.
        """
        print("running test {}".format(self.name()))

        conn_id = self.create_connection()

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # get the catalog information from discovery
        found_catalogs = menagerie.get_catalogs(conn_id)
        additional_md = [{"breadcrumb": [], "metadata": {'replication-method': 'LOG_BASED'}}]

        # Don't select unsupported data types
        non_selected_properties = ["nvarchar_text", "varchar_text", "varbinary_data",
                                   "geospacial", "geospacial_map", "markup", "tree",
                                   "variant", "SpecialPurposeColumns", "started_at", "ended_at"]
        BaseTapTest.select_all_streams_and_fields(
            conn_id, found_catalogs, additional_md=additional_md,
            non_selected_properties=non_selected_properties)

        # clear state
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify record counts of streams
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id())
        expected_count = {k: len(v['values']) for k, v in self.expected_metadata().items()}
        # self.assertEqual(record_count_by_stream, expected_count)

        # verify records match on the first sync
        records_by_stream = runner.get_records_from_target_output()

        table_version = dict()
        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                table_version[stream] = records_by_stream[stream]['table_version']

                # verify that on the first sync you get an activate_version
                # message before and after all of the data for the full table
                # portion, and before the logical replication portion
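                # expected message layout for the first sync:
                #   activate_version, upsert..., activate_version   <- full table portion
                #   activate_version[, upsert]                      <- log-based portion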
                last_row_data = bool(records_by_stream[stream]['messages'][-1].get("data"))

                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-2]['action'],
                    'activate_version')
                if last_row_data:
                    self.assertEqual(
                        records_by_stream[stream]['messages'][-3]['action'],
                        'activate_version')
                else:
                    self.assertEqual(
                        records_by_stream[stream]['messages'][-1]['action'],
                        'activate_version')
                self.assertEqual(
                    len([m for m in records_by_stream[stream]['messages'][1:] if m["action"] == "activate_version"]),
                    2,
                    msg="Expect 2 more activate version messages for end of full table and beginning of log based")

                column_names = [
                    list(field_data.keys())[0] for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [
                    {
                        "action": "upsert", "data":
                        {
                            column: value for column, value
                            in list(zip(column_names, stream_expected_data[self.VALUES][row]))
                            if column not in non_selected_properties
                        }
                    } for row in range(len(stream_expected_data[self.VALUES]))
                ]

                # Verify all data is correct for the full table part
                if last_row_data:
                    final_row = -3
                else:
                    final_row = -2

                for expected_row, actual_row in list(
                        zip(expected_messages, records_by_stream[stream]['messages'][1:final_row])):
                    with self.subTest(expected_row=expected_row):

                        self.assertEqual(actual_row["action"], "upsert")
                        self.assertEqual(len(expected_row["data"].keys()), len(actual_row["data"].keys()),
                                         msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row["data"].items():
                            self.assertEqual(expected_value, actual_row["data"][column_name],
                                             msg="expected: {} != actual {}".format(
                                                 expected_row, actual_row))

                # Verify all data is correct for the log replication part if sent
                if last_row_data:
                    for column_name, expected_value in expected_messages[-1]["data"].items():
                        self.assertEqual(expected_value,
                                         records_by_stream[stream]['messages'][-1]["data"][column_name],
                                         msg="expected: {} != actual {}".format(
                                             expected_messages[-1],
                                             records_by_stream[stream]['messages'][-1]))

                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(state.get('currently_syncing'), msg="expected state's currently_syncing to be None")
                self.assertIsNotNone(
                    bookmark.get('current_log_version'),
                    msg="expected bookmark to have current_log_version because we are using log replication")
                self.assertTrue(bookmark['initial_full_table_complete'], msg="expected full table to be complete")
                initial_log_version = bookmark['current_log_version']

                self.assertEqual(bookmark['version'], table_version[stream],
                                 msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(expected_schemas,
                                                                         records_by_stream[stream]['schema']))

        # ----------------------------------------------------------------------
        # invoke the sync job AGAIN after inserting, updating, and deleting rows
        # ----------------------------------------------------------------------

        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "text_and_image_deprecated_soon"
        column_name = ["pk", "nvarchar_text", "varchar_text", "varbinary_data",
                       "rowversion_synonym_timestamp"]
        insert_value = [(2, "JKL", "MNO", "PQR".encode('utf-8'))]
        update_value = [(1, "JKL", "MNO", "PQR".encode('utf-8'))]
        delete_value = [(0, )]
        query_list = (insert(database_name, schema_name, table_name, insert_value, column_name[:-1]))
        query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1]))
        query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name))
        mssql_cursor_context_manager(*query_list)
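        # Build the expected rows for the log-based portion: attach the new
        # rowversions (selected pk desc, so insert pk=2 then update pk=1) plus
        # None for _sdc_deleted_at, append the deleted row (pk=0) with a delete
        # timestamp, and reorder to the change order: insert(2), delete(0), update(1).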
        values = insert_value + update_value
        rows = mssql_cursor_context_manager(*[
            "select rowversion_synonym_timestamp from data_types_database.dbo.text_and_image_deprecated_soon "
            "where pk in (0,1,2) order by pk desc"])
        rows = [tuple(row) for row in rows]
        rows = [("0x{}".format(value.hex().upper()), ) for value, in rows]
        row_with_version = [x[0] + x[1] + (None, ) for x in zip(values, rows)]
        row_with_version.append((0, None, None, None, None, datetime.utcnow()))
        row_with_version[1], row_with_version[2] = row_with_version[2], row_with_version[1]
        self.EXPECTED_METADATA['data_types_database_dbo_text_and_image_deprecated_soon']['values'] = row_with_version
        self.EXPECTED_METADATA["data_types_database_dbo_text_and_image_deprecated_soon"]["fields"].append(
            {"_sdc_deleted_at": {
                'sql-datatype': 'datetime', 'selected-by-default': True, 'inclusion': 'automatic'}}
        )

        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "weirdos"
        column_name = [
            "pk", "geospacial", "geospacial_map", "markup", "guid", "tree",
            "variant", "SpecialPurposeColumns", "version"]
        insert_value = [(3, None, None, None, str(uuid.uuid1()).upper(), None, None, None)]
        update_value = [(1, None, None, None, str(uuid.uuid1()).upper(), None, None, None)]
        delete_value = [(0,)]
        query_list = (insert(database_name, schema_name, table_name, insert_value, column_name[:-1]))
        query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1]))
        query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name))
        mssql_cursor_context_manager(*query_list)
        values = insert_value + update_value
        rows = mssql_cursor_context_manager(*[
            "select version from data_types_database.dbo.weirdos "
            "where pk in (0,1,3) order by pk desc"])
        rows = [tuple(row) for row in rows]
        rows = [("0x{}".format(value.hex().upper()), ) for value, in rows]
        row_with_version = [x[0] + x[1] + (None, ) for x in zip(values, rows)]
        row_with_version.append((0, None, None, None, None, None, None, None, None, datetime.utcnow()))
        row_with_version[1], row_with_version[2] = row_with_version[2], row_with_version[1]
        self.EXPECTED_METADATA['data_types_database_dbo_weirdos']['values'] = row_with_version
        self.EXPECTED_METADATA["data_types_database_dbo_weirdos"]["fields"].append(
            {"_sdc_deleted_at": {
                'sql-datatype': 'datetime', 'selected-by-default': True, 'inclusion': 'automatic'}}
        )

        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "computed_columns"
        column_name = ["pk", "started_at", "ended_at", "durations_days"]
        insert_value = [(2, datetime(1980, 5, 30, 16), datetime.now())]
        update_value = [(1, datetime(1942, 11, 30), datetime(2017, 2, 12))]
        delete_value = [(0,)]
        query_list = (insert(database_name, schema_name, table_name, insert_value, column_name[:-1]))
        query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1]))
        query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name))
        mssql_cursor_context_manager(*query_list)
        values = insert_value + update_value  # + [delete_value[0] + (None, None)]
        rows = mssql_cursor_context_manager(
            *["select durations_days from data_types_database.dbo.computed_columns "
              "where pk in (0,1,2) order by pk desc"])
        rows = [tuple(row) for row in rows]
        row_with_duration = [x[0] + x[1] + (None, ) for x in zip(values, rows)]
        row_with_duration.append((0, None, None, None, datetime.utcnow()))
        row_with_duration[1], row_with_duration[2] = row_with_duration[2], row_with_duration[1]
        self.EXPECTED_METADATA['data_types_database_dbo_computed_columns']['values'] = row_with_duration
        self.EXPECTED_METADATA["data_types_database_dbo_computed_columns"]["fields"].append(
            {"_sdc_deleted_at": {
                'sql-datatype': 'datetime', 'selected-by-default': True, 'inclusion': 'automatic'}}
        )

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id())
        expected_count = {k: len(v['values']) for k, v in self.expected_metadata().items()}
        self.assertEqual(record_count_by_stream, expected_count)
        records_by_stream = runner.get_records_from_target_output()

        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                new_table_version = records_by_stream[stream]['table_version']

                # verify on a subsequent sync you get activate version message only after all data
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertTrue(all(
                    [message["action"] == "upsert" for message in records_by_stream[stream]['messages'][1:]]
                ))

                column_names = [
                    list(field_data.keys())[0] for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [
                    {
                        "action": "upsert", "data":
                        {
                            column: value for column, value
                            in list(zip(column_names, stream_expected_data[self.VALUES][row]))
                            if column not in non_selected_properties
                        }
                    } for row in range(len(stream_expected_data[self.VALUES]))
                ]

                # remove sequences from actual values for comparison
                for message in records_by_stream[stream]['messages'][1:]:
                    message.pop("sequence")

                # Verify all data is correct
                for expected_row, actual_row in list(
                        zip(expected_messages, records_by_stream[stream]['messages'][1:])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")

                        # we only send the _sdc_deleted_at column for deleted rows
                        self.assertGreaterEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="the actual row has more columns than expected")

                        for column_name, expected_value in expected_row["data"].items():
                            if column_name != "_sdc_deleted_at":
                                self.assertEqual(expected_value, actual_row["data"][column_name],
                                                 msg="expected: {} != actual {}".format(
                                                     expected_row, actual_row))
                            elif expected_value:
                                # we have an expected value for a deleted row
                                try:
                                    actual_value = datetime.strptime(actual_row["data"][column_name],
                                                                     "%Y-%m-%dT%H:%M:%S.%fZ")
                                except ValueError:
                                    actual_value = datetime.strptime(actual_row["data"][column_name],
                                                                     "%Y-%m-%dT%H:%M:%SZ")
                                self.assertGreaterEqual(actual_value, expected_value - timedelta(seconds=15))
                                self.assertLessEqual(actual_value, expected_value + timedelta(seconds=15))
                            else:
                                # the row wasn't deleted so we can either not pass the column or it can be None
                                self.assertIsNone(actual_row["data"].get(column_name))

                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(state.get('currently_syncing'), msg="expected state's currently_syncing to be None")
                self.assertIsNotNone(
                    bookmark.get('current_log_version'),
                    msg="expected bookmark to have current_log_version because we are using log replication")
                self.assertTrue(bookmark['initial_full_table_complete'], msg="expected full table to be complete")
                new_log_version = bookmark['current_log_version']
                self.assertGreater(new_log_version, initial_log_version,
                                   msg='expected log version to increase')

                self.assertEqual(bookmark['version'], table_version[stream],
                                 msg="expected bookmark for stream to match version")
                self.assertEqual(bookmark['version'], new_table_version,
                                 msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(expected_schemas,
                                                                         records_by_stream[stream]['schema']))
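
The query builders used throughout (insert, update_by_pk, delete_by_pk) come from the suite's shared database utilities and return lists of SQL statements for mssql_cursor_context_manager to execute. The sketch below is hypothetical, inferred only from the call sites above; the real helpers handle quoting, NULLs, binary literals, and identity columns more carefully:

def insert(database, schema, table, rows, column_names=None):
    # Hypothetical sketch inferred from call sites; not the suite's real helper.
    # Build one INSERT statement per row, optionally with an explicit column list.
    columns = " ({})".format(", ".join(column_names)) if column_names else ""
    return [
        "INSERT INTO {}.{}.{}{} VALUES ({})".format(
            database, schema, table, columns,
            ", ".join("NULL" if v is None else repr(v) for v in row))
        for row in rows]

def update_by_pk(database, schema, table, rows, column_names):
    # Hypothetical sketch: the first column is the primary key, the rest are SET targets.
    pk_column, value_columns = column_names[0], column_names[1:]
    return [
        "UPDATE {}.{}.{} SET {} WHERE {} = {}".format(
            database, schema, table,
            ", ".join("{} = {}".format(c, "NULL" if v is None else repr(v))
                      for c, v in zip(value_columns, row[1:])),
            pk_column, row[0])
        for row in rows]

def delete_by_pk(database, schema, table, pk_values, pk_columns):
    # Hypothetical sketch: delete each row matching its primary-key value.
    return [
        "DELETE FROM {}.{}.{} WHERE {} = {}".format(
            database, schema, table, pk_columns[0], row[0])
        for row in pk_values]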
Example #2
    def setUpClass(cls) -> None:
        """Create the expected schema in the test database"""

        drop_all_user_databases()
        database_name = "data_types_database"
        schema_name = "dbo"

        query_list = list(create_database(database_name, "Latin1_General_CS_AS"))
        query_list.extend(enable_database_tracking(database_name))

        text_values = [
            (0, None, None, None),
            (1, "abc", "def", "ghi".encode('utf-8'))
        ]
        text_schema = {
            'selected': True,
            'properties': {
                'pk': {
                    'inclusion': 'automatic',
                    'maximum': 2147483647,
                    'minimum': -2147483648,
                    'type': ['integer'],
                    'selected': True},
                'rowversion_synonym_timestamp': {'inclusion': 'available', 'selected': True, 'type': ['string', 'null']},
                'varchar_text': {},
                'nvarchar_text': {},
                'varbinary_data': {},
                "_sdc_deleted_at": {'format': 'date-time', 'type': ['string', 'null']}},
            'type': 'object'}

        other_values = [
            (0, None, None, None, "827376B0-AEF4-11E9-8002-0800276BC1DF", None, None, None),
            (1, None, None, None, "ACC9A986-AEF4-11E9-8002-0800276BC1DF", None, None, None),
            (2, None, None, None, "B792681C-AEF4-11E9-8002-0800276BC1DF", None, None, None)
        ]
        other_schema = {
            'selected': True,
            'properties': {
                'markup': {},
                'variant': {},
                'geospacial': {},
                'SpecialPurposeColumns': {},
                'tree': {},
                'guid': {
                    'inclusion': 'available',
                    'selected': True,
                    'pattern': '[A-F0-9]{8}-([A-F0-9]{4}-){3}[A-F0-9]{12}',
                    'type': ['string', 'null']},
                'geospacial_map': {},
                'pk': {
                    'inclusion': 'automatic',
                    'maximum': 2147483647,
                    'minimum': -2147483648,
                    'type': ['integer'],
                    'selected': True},
                'version': {'inclusion': 'available', 'selected': True, 'type': ['string', 'null']},
                "_sdc_deleted_at": {'format': 'date-time', 'type': ['string', 'null']}},
            'type': 'object'}

        comp_values = [
            (0, datetime(1970, 7, 8, 3), datetime.now()),
            (1, datetime(1970, 1, 1, 0), datetime.now())
        ]
        comp_schema = {
            'selected': True,
            'properties': {
                'started_at': {
                    'selected': False,
                    'type': ['string', 'null'],
                    'inclusion': 'available',
                    'format': 'date-time'},
                'durations_days': {
                    'inclusion': 'available',
                    'maximum': 2147483647,
                    'minimum': -2147483648,
                    'type': ['integer', 'null'],
                    'selected': True},
                'ended_at': {
                    'format': 'date-time',
                    'inclusion': 'available',
                    'type': ['string', 'null'],
                    'selected': False},
                'pk': {
                    'inclusion': 'automatic',
                    'maximum': 2147483647,
                    'minimum': -2147483648,
                    'type': ['integer'],
                    'selected': True},
                "_sdc_deleted_at": {'format': 'date-time', 'type': ['string', 'null']}},
            'type': 'object'}

        cls.EXPECTED_METADATA = {
            'data_types_database_dbo_text_and_image_deprecated_soon': {
                'is-view': False,
                'schema-name': schema_name,
                'row-count': 0,
                'values': text_values,
                'table-key-properties': {'pk'},
                'selected': None,
                'database-name': database_name,
                'stream_name': 'text_and_image_deprecated_soon',
                'fields': [
                    {"pk": {'sql-datatype': 'int', 'selected-by-default': True, 'inclusion': 'automatic'}},
                    {"nvarchar_text": {'sql-datatype': 'ntext', 'selected-by-default': False,
                                       'inclusion': 'unavailable'}},
                    {"varchar_text": {'sql-datatype': 'text', 'selected-by-default': False,
                                      'inclusion': 'unavailable'}},
                    {"varbinary_data": {'sql-datatype': 'image', 'selected-by-default': False,
                                        'inclusion': 'unavailable'}},
                    {"rowversion_synonym_timestamp": {'sql-datatype': 'timestamp', 'selected-by-default': True,
                                                      'inclusion': 'available'}}],
                'schema': text_schema},
            'data_types_database_dbo_weirdos': {
                'is-view': False,
                'schema-name': schema_name,
                'row-count': 0,
                'values': other_values,
                'table-key-properties': {'pk'},
                'selected': None,
                'database-name': database_name,
                'stream_name': 'weirdos',
                'fields': [
                    {"pk": {'sql-datatype': 'int', 'selected-by-default': True, 'inclusion': 'automatic'}},
                    {"geospacial": {'sql-datatype': 'geometry', 'selected-by-default': False,
                                    'inclusion': 'unavailable'}},
                    {"geospacial_map": {'sql-datatype': 'geography', 'selected-by-default': False,
                                        'inclusion': 'unavailable'}},
                    {"markup": {'sql-datatype': 'xml', 'selected-by-default': False, 'inclusion': 'unavailable'}},
                    {"guid": {'sql-datatype': 'uniqueidentifier', 'selected-by-default': True,
                              'inclusion': 'available'}},
                    {"tree": {'sql-datatype': 'hierarchyid', 'selected-by-default': False, 'inclusion': 'unavailable'}},
                    {"variant": {'sql-datatype': 'sql_variant', 'selected-by-default': False,
                                 'inclusion': 'unavailable'}},
                    {"SpecialPurposeColumns": {'sql-datatype': 'xml', 'selected-by-default': False,
                                               'inclusion': 'unavailable'}},
                    {"version": {'sql-datatype': 'timestamp', 'selected-by-default': True, 'inclusion': 'available'}}],
                'schema': other_schema},
            'data_types_database_dbo_computed_columns': {
                'is-view': False,
                'schema-name': schema_name,
                'row-count': 0,
                'values': comp_values,
                'table-key-properties': {'pk'},
                'selected': None,
                'database-name': database_name,
                'stream_name': 'computed_columns',
                'fields': [
                    {"pk": {'sql-datatype': 'int', 'selected-by-default': True, 'inclusion': 'automatic'}},
                    {"started_at": {'sql-datatype': 'datetimeoffset', 'selected-by-default': True,
                                    'inclusion': 'available'}},
                    {"ended_at": {'sql-datatype': 'datetimeoffset', 'selected-by-default': True,
                                  'inclusion': 'available'}},
                    {"durations_days": {'sql-datatype': 'int', 'selected-by-default': True,
                                        'inclusion': 'unavailable'}}],
                'schema': comp_schema},
        }

        # test timestamp and unsupported data types
        table_name = "text_and_image_deprecated_soon"
        column_name = ["pk", "nvarchar_text", "varchar_text", "varbinary_data",
                       "rowversion_synonym_timestamp"]
        column_type = ["int", "ntext", "text", "image", "timestamp"]
        primary_key = {"pk"}
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(create_table(database_name, schema_name, table_name, column_def,
                                       primary_key=primary_key, tracking=True))
        query_list.extend(insert(database_name, schema_name, table_name, text_values, column_name[:-1]))

        # test uniqueidentifier and rowversion
        table_name = "weirdos"
        column_name = [
            "pk", "geospacial", "geospacial_map", "markup", "guid", "tree",
            "variant", "SpecialPurposeColumns", "version"
        ]
        column_type = [
            "int", "geometry", "geography", "xml", "uniqueidentifier", "hierarchyid",
            "sql_variant", "xml COLUMN_SET FOR ALL_SPARSE_COLUMNS", "rowversion"
        ]
        primary_key = {"pk"}
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(create_table(database_name, schema_name, table_name, column_def,
                                       primary_key=primary_key, tracking=True))
        # inserting these rows in a single statement fails with "Parameter information
        # is missing from a user-defined type.", so insert them one at a time
        for value in other_values:
            query_list.extend(insert(database_name, schema_name, table_name, [value], column_name[:-1]))

        table_name = "computed_columns"
        column_name = ["pk", "started_at", "ended_at", "durations_days"]
        column_type = ["int", "datetimeoffset", "datetimeoffset", "AS DATEDIFF(day, started_at, ended_at)"]
        primary_key = {"pk"}
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(create_table(database_name, schema_name, table_name, column_def,
                                       primary_key=primary_key, tracking=True))
        query_list.extend(insert(database_name, schema_name, table_name, comp_values, column_name[:-1]))
        mssql_cursor_context_manager(*query_list)

        # update values with rowversions
        rows = mssql_cursor_context_manager(*["select version from data_types_database.dbo.weirdos order by pk"])
        rows = ["0x{}".format(value.hex().upper()) for value, in rows]
        cls.EXPECTED_METADATA['data_types_database_dbo_weirdos']['values'] = \
            [other_values[row] + (version,) for row, version in enumerate(rows)]

        rows = mssql_cursor_context_manager(*[
            "select rowversion_synonym_timestamp from data_types_database.dbo.text_and_image_deprecated_soon order by pk"])
        rows = ["0x{}".format(value.hex().upper()) for value, in rows]
        cls.EXPECTED_METADATA['data_types_database_dbo_text_and_image_deprecated_soon']['values'] = \
            [text_values[row] + (version,) for row, version in enumerate(rows)]

        rows = mssql_cursor_context_manager(
            *["select durations_days from data_types_database.dbo.computed_columns order by pk"])
        cls.EXPECTED_METADATA['data_types_database_dbo_computed_columns']['values'] = \
            [comp_values[row] + tuple(version) for row, version in enumerate(rows)]

        cls.expected_metadata = cls.discovery_expected_metadata
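
The rowversion values come back from the driver as bytes; the test reformats them into the uppercase "0x..." hex literal the tap is expected to emit. A standalone illustration of that transform:

# A rowversion is an 8-byte counter returned by the driver as bytes;
# the expected record value is its uppercase hex literal.
raw = bytes([0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0xD1])
assert "0x{}".format(raw.hex().upper()) == "0x00000000000007D1"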
Example #3
    def setUpClass(cls) -> None:
        """Create the expected schema in the test database"""
        drop_all_user_databases()
        database_name = "data_types_database"
        schema_name = "dbo"

        # use all valid unicode characters
        chars = list(range(0, 55296))
        chars.extend(
            range(57344, 65534)
        )  # TODO - BUG https://stitchdata.atlassian.net/browse/SRCE-1217
        chars.extend(range(65535, sys.maxunicode))
        chars.reverse()  # pop starting with ascii characters

        char_values = [(pk, "".join([chr(chars.pop()) for _ in range(2)]))
                       for pk in range(16)]
        char_values.extend([
            (16, None),
        ])
        char_schema = {
            'type': 'object',
            'selected': True,
            'properties': {
                'char_2': {
                    'type': ['string', 'null'],
                    'maxLength': 2,
                    'inclusion': 'available',
                    'selected': True
                },
                # 'minLength': 2},
                'pk': {
                    'maximum': 2147483647,
                    'type': ['integer'],
                    'inclusion': 'automatic',
                    'selected': True,
                    'minimum': -2147483648
                }
            }
        }

        varchar_values = [(pk, chr(chars.pop()), "".join([
            chr(chars.pop()) for _ in range(15)
        ]), "".join([chr(chars.pop()) for _ in range(randint(1, 16))]))
                          for pk in range(3)]
        varchar_values.extend([
            (3, None, None, None),
        ])
        varchar_schema = {
            'type': 'object',
            'selected': True,
            'properties': {
                'pk': {
                    'maximum': 2147483647,
                    'type': ['integer'],
                    'inclusion': 'automatic',
                    'selected': True,
                    'minimum': -2147483648
                },
                'varchar_8000': {
                    'type': ['string', 'null'],
                    'maxLength': 8000,
                    'inclusion': 'available',
                    'selected': True
                },
                # 'minLength': 0},
                'varchar_5': {
                    'type': ['string', 'null'],
                    'maxLength': 5,
                    'inclusion': 'available',
                    'selected': True
                },
                # 'minLength': 0},
                'varchar_max': {
                    'type': ['string', 'null'],
                    'maxLength': 2147483647,
                    'inclusion': 'available',
                    'selected': True
                }
            }
        }
        # 'minLength': 0}}}

        chars.reverse()
        nchar_values = [(pk, "".join([chr(chars.pop()) for _ in range(4)]))
                        for pk in range(3)]
        #  expect that values are right padded with spaces in the db.
        nchar_values = [
            (x, "{}{}".format(y,
                              " " * ((16 - len(y.encode('utf-16-le'))) // 2)))
            for x, y in nchar_values
        ]
        nchar_values.extend([
            (3, None),
        ])
        nchar_schema = {
            'type': 'object',
            'selected': True,
            'properties': {
                'nchar_8': {
                    'type': ['string', 'null'],
                    'maxLength': 8,
                    'inclusion': 'available',
                    'selected': True
                },
                # 'minLength': 8},  # length is based on bytes, not characters
                'pk': {
                    'maximum': 2147483647,
                    'type': ['integer'],
                    'inclusion': 'automatic',
                    'selected': True,
                    'minimum': -2147483648
                }
            }
        }

        nvarchar_values = [(pk, chr(chars.pop()), "".join([
            chr(chars.pop()) for _ in range(8)
        ]), "".join([chr(chars.pop()) for _ in range(randint(1, 8))]))
                           for pk in range(50)]
        nvarchar_values.extend([
            (50, None, None, None),
        ])

        pk = 51
        while len(chars):
            #  Use the rest of the characters
            nvarchar_values.extend([(pk, chr(chars.pop()), "".join([
                chr(chars.pop()) for _ in range(min(len(chars), 800))
            ]) if len(chars) else "", "".join([
                chr(chars.pop())
                for _ in range(min(len(chars), randint(1, 800)))
            ]) if len(chars) else "")])
            pk += 1

        nvarchar_schema = {
            'type': 'object',
            'selected': True,
            'properties': {
                'nvarchar_max': {
                    'type': ['string', 'null'],
                    'maxLength': 2147483647,
                    'inclusion': 'available',
                    'selected': True
                },
                # 'minLength': 0},
                'pk': {
                    'maximum': 2147483647,
                    'type': ['integer'],
                    'inclusion': 'automatic',
                    'selected': True,
                    'minimum': -2147483648
                },
                'nvarchar_4000': {
                    'type': ['string', 'null'],
                    'maxLength': 4000,
                    'inclusion': 'available',
                    'selected': True
                },
                # 'minLength': 0},
                'nvarchar_5': {
                    'type': ['string', 'null'],
                    'maxLength': 5,
                    'inclusion': 'available',
                    'selected': True
                }
            }
        }
        # 'minLength': 0}}}

        cls.EXPECTED_METADATA = {
            'data_types_database_dbo_char_data': {
                'is-view':
                False,
                'schema-name':
                schema_name,
                'row-count':
                0,
                'values':
                char_values,
                'table-key-properties': {'pk'},
                'selected':
                None,
                'database-name':
                database_name,
                'stream_name':
                'char_data',
                'fields': [{
                    'pk': {
                        'sql-datatype': 'int',
                        'selected-by-default': True,
                        'inclusion': 'automatic'
                    }
                }, {
                    'char_2': {
                        'sql-datatype': 'char',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }],
                'schema':
                char_schema
            },
            'data_types_database_dbo_varchar_data': {
                'is-view':
                False,
                'schema-name':
                schema_name,
                'row-count':
                0,
                'values':
                varchar_values,
                'table-key-properties': {'pk'},
                'selected':
                None,
                'database-name':
                database_name,
                'stream_name':
                'varchar_data',
                'fields': [{
                    'pk': {
                        'sql-datatype': 'int',
                        'selected-by-default': True,
                        'inclusion': 'automatic'
                    }
                }, {
                    'varchar_5': {
                        'sql-datatype': 'varchar',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }, {
                    'varchar_8000': {
                        'sql-datatype': 'varchar',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }, {
                    'varchar_max': {
                        'sql-datatype': 'varchar',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }],
                'schema':
                varchar_schema
            },
            'data_types_database_dbo_nchar_data': {
                'is-view':
                False,
                'schema-name':
                schema_name,
                'row-count':
                0,
                'values':
                nchar_values,
                'table-key-properties': {'pk'},
                'selected':
                None,
                'database-name':
                database_name,
                'stream_name':
                'nchar_data',
                'fields': [{
                    'pk': {
                        'sql-datatype': 'int',
                        'selected-by-default': True,
                        'inclusion': 'automatic'
                    }
                }, {
                    'nchar_8': {
                        'sql-datatype': 'nchar',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }],
                'schema':
                nchar_schema
            },
            'data_types_database_dbo_nvarchar_data': {
                'is-view':
                False,
                'schema-name':
                schema_name,
                'row-count':
                0,
                'values':
                nvarchar_values,
                'table-key-properties': {'pk'},
                'selected':
                None,
                'database-name':
                database_name,
                'stream_name':
                'nvarchar_data',
                'fields': [{
                    'pk': {
                        'sql-datatype': 'int',
                        'selected-by-default': True,
                        'inclusion': 'automatic'
                    }
                }, {
                    'nvarchar_5': {
                        'sql-datatype': 'nvarchar',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }, {
                    'nvarchar_4000': {
                        'sql-datatype': 'nvarchar',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }, {
                    'nvarchar_max': {
                        'sql-datatype': 'nvarchar',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }],
                'schema':
                nvarchar_schema
            },
        }
        query_list = list(
            create_database(database_name, "Latin1_General_CS_AS"))

        table_name = "char_data"
        column_name = ["pk", "char_2"]  # , "char_8000"]
        column_type = ["int", "char(2)"]  # , "char(8000)"]
        primary_key = {"pk"}
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))
        query_list.extend(
            insert(database_name, schema_name, table_name, char_values))

        table_name = "varchar_data"
        column_name = ["pk", "varchar_5", "varchar_8000", "varchar_max"]
        column_type = ["int", "varchar(5)", "varchar(8000)", "varchar(max)"]
        primary_key = {"pk"}
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))
        query_list.extend(
            insert(database_name, schema_name, table_name, varchar_values))

        table_name = "nchar_data"
        column_name = ["pk", "nchar_8"]  # , "nchar_4000"]
        column_type = ["int", "nchar(8)"]  # , "nchar(4000)"]
        primary_key = {"pk"}
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))
        # strip padding off query data
        nchar_query_values = [(x, y.rstrip() if isinstance(y, str) else y)
                              for x, y in nchar_values]
        query_list.extend(
            insert(database_name, schema_name, table_name, nchar_query_values))

        table_name = "nvarchar_data"
        column_name = ["pk", "nvarchar_5", "nvarchar_4000", "nvarchar_max"]
        column_type = ["int", "nvarchar(5)", "nvarchar(4000)", "nvarchar(max)"]
        primary_key = {"pk"}
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))

        query_list.extend(
            insert(database_name, schema_name, table_name, nvarchar_values))
        query_list.extend(
            ['-- there are {} characters left to test'.format(len(chars))])

        cls.expected_metadata = cls.discovery_expected_metadata

        mssql_cursor_context_manager(*query_list)
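
The nchar padding above is byte-based: an nchar(8) column stores exactly 8 UTF-16 code units (16 bytes), and SQL Server right-pads shorter values with spaces. Characters outside the Basic Multilingual Plane occupy two code units each, which is why the pad width is computed from the UTF-16-LE byte length rather than len(). A standalone sketch of the same math:

def pad_nchar8(value):
    # Right-pad to 8 UTF-16 code units (16 bytes), mirroring how SQL Server
    # stores nchar(8); non-BMP characters count as two code units.
    return value + " " * ((16 - len(value.encode('utf-16-le'))) // 2)

assert pad_nchar8("abcd") == "abcd" + " " * 4                  # 4 BMP chars -> 4 pad spaces
assert pad_nchar8("ab\U0001F600") == "ab\U0001F600" + " " * 4  # the emoji takes 2 code units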
Example #4
    def setUpClass(cls) -> None:
        """Create the expected schema in the test database"""
        drop_all_user_databases()
        database_name = "data_types_database"
        schema_name = "dbo"

        values = [
            (0, 1.1754944e-38, 2.2250738585072014e-308, 1.1754944e-38),
            (1, 3.4028235e+38, 1.7976931348623157e+308, 3.4028235e+38),
            (2, -1.1754944e-38, -2.2250738585072014e-308, -1.1754944e-38),
            (3, -3.4028235e+38, -1.7976931348623157e+308, -3.4028235e+38),
            (4, 0.0, 0.0, 0.0), (5, None, None, None),
            (6, 7.830105e-33, 6.46504535047369e-271, 4.0229383e-27),
            (7, 4.4540307e-21, 7.205251086772512e-202, 7.196247e-19),
            (8, 647852.6, 2.1597057137884757e+40, 8.430207e+34),
            (9, 3603.407, 8.811948588549982e+23, 9.1771755e+35),
            (10, -8.451405e-24, -1.783306877438393e-178, -2.2775854e-31),
            (11, -5.8271772e-27, -9.344274532947989e-227, -3.5728205e-18),
            (12, -8.519153e+23, -2.3035944912603858e+241, -5.7120217e+35),
            (13, -30306750.0, -5.222263032559684e+106, -1.9535917e+27)
        ]

        schema = {
            'selected': True,
            'type': 'object',
            'properties': {
                'float_24': {
                    'selected': True,
                    'type': ['number', 'null'],
                    'inclusion': 'available'
                },
                'float_53': {
                    'selected': True,
                    'type': ['number', 'null'],
                    'inclusion': 'available'
                },
                'real_24_bits': {
                    'selected': True,
                    'type': ['number', 'null'],
                    'inclusion': 'available'
                },
                'pk': {
                    'selected': True,
                    'type': ['integer'],
                    'maximum': 2147483647,
                    'minimum': -2147483648,
                    'inclusion': 'automatic'
                }
            }
        }

        cls.EXPECTED_METADATA = {
            'data_types_database_dbo_float_precisions': {
                'is-view':
                False,
                'schema-name':
                schema_name,
                'row-count':
                0,
                'values':
                values,
                'table-key-properties': {'pk'},
                'selected':
                None,
                'database-name':
                database_name,
                'stream_name':
                'float_precisions',
                'fields': [{
                    'pk': {
                        'sql-datatype': 'int',
                        'selected-by-default': True,
                        'inclusion': 'automatic'
                    }
                }, {
                    'float_24': {
                        'sql-datatype': 'real',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }, {
                    'float_53': {
                        'sql-datatype': 'float',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }, {
                    'real_24_bits': {
                        'sql-datatype': 'real',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }],
                'schema':
                schema
            }
        }

        query_list = list(
            create_database(database_name, "Latin1_General_CS_AS"))
        # query_list.extend(create_schema(database_name, schema_name))

        table_name = "float_precisions"
        column_name = ["pk", "float_24", "float_53", "real_24_bits"]
        column_type = ["int", "float(24)", "float(53)", "real"]
        primary_key = {"pk"}
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))
        query_list.extend(
            insert(
                database_name, schema_name, table_name, cls.EXPECTED_METADATA[
                    "data_types_database_dbo_float_precisions"]["values"]))

        mssql_cursor_context_manager(*query_list)

        cls.expected_metadata = cls.discovery_expected_metadata
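
The first four rows of values are the IEEE 754 boundary cases: the smallest positive normal and largest finite single- and double-precision numbers, positive and negative. float(53) maps directly onto a Python float; what a float(24)/real column stores can be checked by round-tripping through 4-byte packing. A standalone sanity check of those constants:

import struct
import sys

# float(53) boundaries are exactly Python's float limits.
assert 1.7976931348623157e+308 == sys.float_info.max  # largest finite double
assert 2.2250738585072014e-308 == sys.float_info.min  # smallest positive normal double

def as_real(x):
    # The value a 4-byte float(24)/real column actually stores.
    return struct.unpack("<f", struct.pack("<f", x))[0]

# 3.4028235e+38 and 1.1754944e-38 are the shortest decimals that select
# FLT_MAX and FLT_MIN; stored as single precision, they read back as:
assert as_real(3.4028235e+38) == 3.4028234663852886e+38
assert as_real(1.1754944e-38) == 1.1754943508222875e-38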
Example #5
    def test_run(self):
        """
        Verify that a full sync captures all of the data and sends it in the
        correct format.
        Verify that the first sync sends an activate_version message immediately.
        Verify that the table version is incremented.
        """
        print("running test {}".format(self.name()))

        conn_id = self.create_connection()

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # get the catalog information from discovery
        found_catalogs = menagerie.get_catalogs(conn_id)
        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'LOG_BASED'
            }
        }]
        BaseTapTest.select_all_streams_and_fields(conn_id,
                                                  found_catalogs,
                                                  additional_md=additional_md)

        # clear state
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify record counts of streams
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        # self.assertEqual(record_count_by_stream, expected_count)

        # verify records match on the first sync
        records_by_stream = runner.get_records_from_target_output()

        table_version = dict()
        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                table_version[stream] = records_by_stream[stream][
                    'table_version']

                # verify that on the first sync you get an activate_version
                # message before and after all of the data for the full table
                # portion, and before the logical replication portion
                last_row_data = bool(records_by_stream[stream]['messages'][-1].get("data"))

                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-2]['action'],
                    'activate_version')
                if last_row_data:
                    self.assertEqual(
                        records_by_stream[stream]['messages'][-3]['action'],
                        'activate_version')
                else:
                    self.assertEqual(
                        records_by_stream[stream]['messages'][-1]['action'],
                        'activate_version')
                self.assertEqual(
                    len([
                        m for m in records_by_stream[stream]['messages'][1:]
                        if m["action"] == "activate_version"
                    ]),
                    2,
                    msg=
                    "Expect 2 more activate version messages for end of full table and beginning of log based"
                )

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(
                            zip(column_names, stream_expected_data[self.VALUES]
                                [row]))
                    }
                } for row in range(len(stream_expected_data[self.VALUES]))]

                # Verify all data is correct for the full table part
                if last_row_data:
                    final_row = -3
                else:
                    final_row = -2

                for expected_row, actual_row in list(
                        zip(expected_messages, records_by_stream[stream]
                            ['messages'][1:final_row])):
                    with self.subTest(expected_row=expected_row):

                        self.assertEqual(actual_row["action"], "upsert")
                        self.assertEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row[
                                "data"].items():
                            if isinstance(expected_value, Decimal):
                                self.assertEqual(
                                    type(actual_row["data"][column_name]),
                                    Decimal,
                                    msg=
                                    "decimal value is not represented as a number"
                                )
                            self.assertEqual(
                                expected_value,
                                actual_row["data"][column_name],
                                msg="expected: {} != actual {}".format(
                                    expected_row, actual_row))

                # Verify all data is correct for the log replication part if sent
                if last_row_data:
                    for column_name, expected_value in expected_messages[-1][
                            "data"].items():
                        self.assertEqual(
                            expected_value,
                            records_by_stream[stream]['messages'][-1]["data"]
                            [column_name],
                            msg="expected: {} != actual {}".format(
                                expected_messages[-1],
                                records_by_stream[stream]['messages'][-1]))

                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNotNone(
                    bookmark.get('current_log_version'),
                    msg=
                    "expected bookmark to have current_log_version because we are using log replication"
                )
                self.assertTrue(bookmark['initial_full_table_complete'],
                                msg="expected full table to be complete")
                inital_log_version = bookmark['current_log_version']

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(
                    records_by_stream[stream]['schema'],
                    simplejson.loads(simplejson.dumps(expected_schemas),
                                     use_decimal=True),
                    msg="expected: {} != actual: {}".format(
                        expected_schemas, records_by_stream[stream]['schema']))

        # ----------------------------------------------------------------------
        # invoke the sync job AGAIN after inserting, updating, and deleting rows
        # ----------------------------------------------------------------------

        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "decimal_precisions"
        precision_scale = DECIMAL_PRECISION_SCALE
        column_type = [
            "decimal({},{})".format(precision, scale)
            for precision, scale in precision_scale
        ]
        column_name = ["pk"] + [
            x.replace("(", "_").replace(",", "_").replace(")", "")
            for x in column_type
        ]
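        # e.g. the column type "decimal(9,4)" becomes the column name "decimal_9_4"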
        insert_value = [
            (7, Decimal('-92473.8401'), Decimal('-4182159664734.645653'),
             Decimal('6101329656084900380190.268036'),
             Decimal('4778017533841887320066645.9761464001349')),
        ]
        update_value = [
            (3, Decimal('-92473.8401'), Decimal('-4182159664734.645653'),
             Decimal('6101329656084900380190.268036'),
             Decimal('4778017533841887320066645.9761464001349')),
        ]
        delete_value = [(4, )]
        query_list = (insert(database_name, schema_name, table_name,
                             insert_value))
        query_list.extend(
            delete_by_pk(database_name, schema_name, table_name, delete_value,
                         column_name[:1]))
        query_list.extend(
            update_by_pk(database_name, schema_name, table_name, update_value,
                         column_name))
        mssql_cursor_context_manager(*query_list)
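        # pad the expected rows for the new _sdc_deleted_at column: None for
        # rows that still exist, and roughly "now" for the deleted row, whose
        # remaining columns come back as None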
        insert_value = [insert_value[0] + (None, )]
        update_value = [update_value[0] + (None, )]
        delete_value = [
            delete_value[0] + (None, None, None, None, datetime.utcnow())
        ]
        self.EXPECTED_METADATA["data_types_database_dbo_decimal_precisions"]["values"] = \
            [self.expected_metadata()["data_types_database_dbo_decimal_precisions"]["values"][-1]] + \
            insert_value + delete_value + update_value
        self.EXPECTED_METADATA["data_types_database_dbo_decimal_precisions"][
            "fields"].append({
                "_sdc_deleted_at": {
                    'sql-datatype': 'datetime',
                    'selected-by-default': True,
                    'inclusion': 'automatic'
                }
            })

        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "numeric_precisions"
        precision_scale = NUMERIC_PRECISION_SCALE
        column_type = [
            "numeric({},{})".format(precision, scale)
            for precision, scale in precision_scale
        ]
        column_name = ["pk"] + [
            x.replace("(", "_").replace(",", "_").replace(")", "")
            for x in column_type
        ]
        insert_value = [
            (7, Decimal('96701.9382'), Decimal('-4371716.186100650268'),
             Decimal('-367352.306093776232045517794'),
             Decimal('-81147872128956247517327931319278572.985')),
        ]
        update_value = [
            (3, Decimal('96701.9382'), Decimal('-4371716.186100650268'),
             Decimal('-367352.306093776232045517794'),
             Decimal('-81147872128956247517327931319278572.985')),
        ]
        delete_value = [(4, )]
        query_list = (insert(database_name, schema_name, table_name,
                             insert_value))
        query_list.extend(
            delete_by_pk(database_name, schema_name, table_name, delete_value,
                         column_name[:1]))
        query_list.extend(
            update_by_pk(database_name, schema_name, table_name, update_value,
                         column_name))
        mssql_cursor_context_manager(*query_list)
        insert_value = [insert_value[0] + (None, )]
        update_value = [update_value[0] + (None, )]
        delete_value = [
            delete_value[0] + (None, None, None, None, datetime.utcnow())
        ]
        self.EXPECTED_METADATA["data_types_database_dbo_numeric_precisions"]["values"] = \
            insert_value + delete_value + update_value
        self.EXPECTED_METADATA["data_types_database_dbo_numeric_precisions"][
            "fields"].append({
                "_sdc_deleted_at": {
                    'sql-datatype': 'datetime',
                    'selected-by-default': True,
                    'inclusion': 'automatic'
                }
            })

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)
        records_by_stream = runner.get_records_from_target_output()

        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                new_table_version = records_by_stream[stream]['table_version']

                # verify on a subsequent sync you get the activate version
                # message only at the start, followed exclusively by upserts
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertTrue(
                    all([
                        message["action"] == "upsert" for message in
                        records_by_stream[stream]['messages'][1:]
                    ]))

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(
                            zip(column_names, stream_expected_data[self.VALUES]
                                [row]))
                    }
                } for row in range(len(stream_expected_data[self.VALUES]))]

                # remove sequences from actual values for comparison
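                # (sequence values are assigned at sync time, so they cannot
                # be compared against fixed expectations)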
                for message in records_by_stream[stream]['messages'][1:]:
                    message.pop("sequence")

                # Verify all data is correct
                for expected_row, actual_row in list(
                        zip(expected_messages,
                            records_by_stream[stream]['messages'][1:])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")

                        # we only send the _sdc_deleted_at column for deleted rows
                        self.assertGreaterEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")

                        for column_name, expected_value in expected_row[
                                "data"].items():
                            if column_name != "_sdc_deleted_at":
                                if isinstance(expected_value, Decimal):
                                    self.assertEqual(
                                        type(actual_row["data"][column_name]),
                                        Decimal,
                                        msg=
                                        "decimal value is not represented as a number"
                                    )
                                    self.assertEqual(
                                        expected_value,
                                        actual_row["data"][column_name],
                                        msg="expected: {} != actual {}".format(
                                            expected_row, actual_row))
                                else:
                                    self.assertEqual(
                                        expected_value,
                                        actual_row["data"][column_name],
                                        msg="expected: {} != actual {}".format(
                                            expected_row, actual_row))
                            elif expected_value:
                                # we have an expected value for a deleted row
                                try:
                                    actual_value = datetime.strptime(
                                        actual_row["data"][column_name],
                                        "%Y-%m-%dT%H:%M:%S.%fZ")
                                except ValueError:
                                    actual_value = datetime.strptime(
                                        actual_row["data"][column_name],
                                        "%Y-%m-%dT%H:%M:%SZ")
                                self.assertGreaterEqual(
                                    actual_value,
                                    expected_value - timedelta(seconds=15))
                                self.assertLessEqual(
                                    actual_value,
                                    expected_value + timedelta(seconds=15))
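                                # allow up to 15 seconds of skew between the
                                # time recorded by the test and the time the
                                # tap stamped the deletion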
                            else:
                                # the row wasn't deleted, so the column may be
                                # omitted entirely or sent as None
                                self.assertIsNone(
                                    actual_row["data"].get(column_name))

                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNotNone(
                    bookmark.get('current_log_version'),
                    msg=
                    "expected bookmark to have current_log_version because we are using log replication"
                )
                self.assertTrue(bookmark['initial_full_table_complete'],
                                msg="expected full table to be complete")
                new_log_version = bookmark['current_log_version']
                self.assertGreater(new_log_version,
                                   initial_log_version,
                                   msg='expected log version to increase')

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")
                self.assertEqual(
                    bookmark['version'],
                    new_table_version,
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(
                    records_by_stream[stream]['schema'],
                    simplejson.loads(simplejson.dumps(expected_schemas),
                                     use_decimal=True),
                    msg="expected: {} != actual: {}".format(
                        expected_schemas, records_by_stream[stream]['schema']))
Example #6
    def test_run(self):
        """stream_expected_data[self.VALUES]
        Verify that a full sync can send capture all data and send it in the correct format
        for integer and boolean (bit) data.
        Verify that the fist sync sends an activate immediately.
        Verify that the table version is incremented up
        """
        print("running test {}".format(self.name()))

        conn_id = self.create_connection()

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # get the catalog information of discovery
        found_catalogs = menagerie.get_catalogs(conn_id)
        # TODO - change the replication key back to replication_key_column when rowversion is supported
        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'INCREMENTAL',
                'replication-key': 'temp_replication_key_column'
            }
        }]

        non_selected_properties = [
            "nvarchar_text", "varchar_text", "varbinary_data", "geospacial",
            "geospacial_map", "markup", "tree", "variant",
            "SpecialPurposeColumns", "started_at", "ended_at"
        ]
        BaseTapTest.select_all_streams_and_fields(
            conn_id,
            found_catalogs,
            non_selected_properties=non_selected_properties,
            additional_md=additional_md)

        # clear state
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify record counts of streams
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)

        # verify records match on the first sync
        records_by_stream = runner.get_records_from_target_output()

        table_version = dict()
        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                table_version[stream] = records_by_stream[stream][
                    'table_version']

                # verify on the first sync you get an activate version message
                # before and after all of the data
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-1]['action'],
                    'activate_version')
                self.assertTrue(
                    all([
                        m["action"] == "upsert"
                        for m in records_by_stream[stream]['messages'][1:-1]
                    ]),
                    msg="Expect all but the first message to be upserts")
                self.assertEqual(len(
                    records_by_stream[stream]['messages'][1:-1]),
                                 len(stream_expected_data[self.VALUES]),
                                 msg="incorrect number of upserts")

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [
                    {
                        "action": "upsert",
                        "data": {
                            column: value
                            for column, value in list(
                                zip(column_names, row_values))
                            if column not in non_selected_properties
                        }  # TODO - change to -1 for using rowversion for replication key
                    } for row_values in sorted(
                        stream_expected_data[self.VALUES],
                        key=lambda row: (row[1] is not None, row[1]))
                ]
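                # (the sort key `(row[1] is not None, row[1])` above orders
                # rows with a NULL replication key first)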

                # Verify all data is correct for incremental
                for expected_row, actual_row in zip(
                        expected_messages,
                        records_by_stream[stream]['messages'][1:-1]):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")
                        self.assertEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row[
                                "data"].items():
                            if isinstance(expected_value, datetime):
                                # sql server only keeps milliseconds not microseconds
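                                # e.g. with a hypothetical value:
                                #   datetime(2018, 12, 31, 23, 59, 59, 993000, tzinfo=timezone.utc)
                                #   .isoformat()               -> '2018-12-31T23:59:59.993000+00:00'
                                #   .replace('000+00:00', 'Z') -> '2018-12-31T23:59:59.993Z'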
                                self.assertEqual(
                                    expected_value.isoformat().replace(
                                        '000+00:00',
                                        'Z').replace('+00:00', 'Z'),
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value.isoformat().replace(
                                            '000+00:00',
                                            'Z').replace('+00:00', 'Z'),
                                        actual_row["data"][column_name]))
                            else:
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_row, actual_row))
                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNone(bookmark.get('current_log_version'),
                                  msg="no log_version for incremental")
                self.assertIsNone(bookmark.get('initial_full_table_complete'),
                                  msg="no full table for incremental")
                # find the max value of the replication key
                # TODO - change to -1 for using rowversion for replication key
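                # e.g. strftime("%Y-%m-%dT%H:%M:%S.%fZ") yields '...59.993000Z'
                # and the re.sub drops the final three digits -> '...59.993Z'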
                self.assertEqual(
                    bookmark['replication_key_value'],
                    re.sub(
                        r'\d{3}Z', "Z",
                        max([
                            row[1] for row in stream_expected_data[self.VALUES]
                        ]).strftime("%Y-%m-%dT%H:%M:%S.%fZ")))
                # self.assertEqual(bookmark['replication_key'], 'replication_key_value')

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(
                                     expected_schemas,
                                     records_by_stream[stream]['schema']))

        # ----------------------------------------------------------------------
        # invoke the sync job AGAIN after inserting and updating rows
        # ----------------------------------------------------------------------

        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "text_and_image_deprecated_soon"
        column_name = [
            "pk", "temp_replication_key_column", "nvarchar_text",
            "varchar_text", "varbinary_data", "replication_key_column"
        ]
        insert_value = [(3,
                         datetime(2018,
                                  12,
                                  31,
                                  23,
                                  59,
                                  59,
                                  993000,
                                  tzinfo=timezone.utc), "JKL", "MNO",
                         "PQR".encode('utf-8'))]
        update_value = [(0,
                         datetime(2018,
                                  12,
                                  31,
                                  23,
                                  59,
                                  59,
                                  997000,
                                  tzinfo=timezone.utc), "JKL", "MNO",
                         "PQR".encode('utf-8'))]
        query_list = (insert(database_name,
                             schema_name,
                             table_name,
                             insert_value,
                             column_names=column_name[:-1]))
        query_list.extend(
            update_by_pk(database_name, schema_name, table_name, update_value,
                         column_name))
        mssql_cursor_context_manager(*query_list)

        values = insert_value + [(
            1, datetime(2018, 12, 31, 23, 59, 59, 987000, tzinfo=timezone.utc),
            "abc", "def", "ghi".encode('utf-8'))] + update_value
        rows = mssql_cursor_context_manager(*[
            "select replication_key_column from data_types_database.dbo.text_and_image_deprecated_soon "
            "where pk in (0, 1,3) order by pk desc"
        ])
        rows = [tuple(row) for row in rows]
        rows = [("0x{}".format(value.hex().upper()), ) for value, in rows]
        row_with_version = [x[0] + x[1] for x in zip(values, rows)]
        self.EXPECTED_METADATA[
            'data_types_database_dbo_text_and_image_deprecated_soon'][
                'values'] = row_with_version

        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "weirdos"
        column_name = [
            "pk", "temp_replication_key_column", "geospacial",
            "geospacial_map", "markup", "guid", "tree", "variant",
            "SpecialPurposeColumns", "replication_key_column"
        ]
        insert_value = [(3,
                         datetime(9999,
                                  12,
                                  31,
                                  23,
                                  59,
                                  59,
                                  993000,
                                  tzinfo=timezone.utc), None, None, None,
                         str(uuid.uuid1()).upper(), None, None, None)]
        update_value = [(1,
                         datetime(9999,
                                  12,
                                  31,
                                  23,
                                  59,
                                  59,
                                  997000,
                                  tzinfo=timezone.utc), None, None, None,
                         str(uuid.uuid1()).upper(), None, None, None)]
        delete_value = [(0, )]
        query_list = (insert(database_name, schema_name, table_name,
                             insert_value, column_name[:-1]))
        query_list.extend(
            delete_by_pk(database_name, schema_name, table_name, delete_value,
                         column_name[:1]))
        query_list.extend(
            update_by_pk(database_name, schema_name, table_name, update_value,
                         column_name))
        mssql_cursor_context_manager(*query_list)

        values = insert_value + [
            (2, datetime(9999, 12, 31, 23, 59, 59, 990000,
                         tzinfo=timezone.utc), None, None, None,
             "B792681C-AEF4-11E9-8002-0800276BC1DF", None, None, None)
        ] + update_value
        rows = mssql_cursor_context_manager(*[
            "select replication_key_column from data_types_database.dbo.weirdos "
            "where pk in (1, 2, 3) order by pk desc"
        ])
        rows = [tuple(row) for row in rows]
        rows = [("0x{}".format(value.hex().upper()), ) for value, in rows]
        row_with_version = [x[0] + x[1] for x in zip(values, rows)]
        self.EXPECTED_METADATA['data_types_database_dbo_weirdos'][
            'values'] = row_with_version

        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "computed_columns"
        column_name = [
            "pk", "temp_replication_key_column", "started_at", "ended_at",
            "replication_key_column"
        ]
        insert_value = [(2,
                         datetime(9998,
                                  12,
                                  31,
                                  23,
                                  59,
                                  59,
                                  990000,
                                  tzinfo=timezone.utc),
                         datetime(1980, 5, 30, 16), datetime.now())]
        update_value = [(0,
                         datetime(9998,
                                  12,
                                  31,
                                  23,
                                  59,
                                  59,
                                  997000,
                                  tzinfo=timezone.utc), datetime(1942, 11, 30),
                         datetime(2017, 2, 12))]
        query_list = (insert(database_name, schema_name, table_name,
                             insert_value, column_name[:-1]))
        query_list.extend(
            update_by_pk(database_name, schema_name, table_name, update_value,
                         column_name))
        mssql_cursor_context_manager(*query_list)
        values = insert_value + [(
            1, datetime(9998, 12, 31, 23, 59, 59, 987000, tzinfo=timezone.utc),
            datetime(1970, 1, 1, 0), datetime.now())] + update_value
        rows = mssql_cursor_context_manager(*[
            "select replication_key_column from data_types_database.dbo.computed_columns "
            "where pk in (0, 1, 2) order by pk desc"
        ])
        rows = [tuple(row) for row in rows]
        row_with_duration = [x[0] + x[1] for x in zip(values, rows)]
        self.EXPECTED_METADATA['data_types_database_dbo_computed_columns'][
            'values'] = row_with_duration

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)
        records_by_stream = runner.get_records_from_target_output()

        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                new_table_version = records_by_stream[stream]['table_version']

                # verify on a subsequent sync you get activate version messages
                # before and after all of the data
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-1]['action'],
                    'activate_version')
                self.assertTrue(
                    all([
                        message["action"] == "upsert" for message in
                        records_by_stream[stream]['messages'][1:-1]
                    ]))
                self.assertEqual(len(
                    records_by_stream[stream]['messages'][1:-1]),
                                 len(stream_expected_data[self.VALUES]),
                                 msg="incorrect number of upserts")

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(zip(column_names,
                                                      row_values))
                        if column not in non_selected_properties
                    }
                } for row_values in sorted(stream_expected_data[self.VALUES],
                                           key=lambda row:
                                           (row[1] is not None, row[1]))]

                # remove sequences from actual values for comparison
                for message in records_by_stream[stream]['messages'][1:-1]:
                    message.pop("sequence")

                # Verify all data is correct
                for expected_row, actual_row in list(
                        zip(expected_messages,
                            records_by_stream[stream]['messages'][1:-1])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")

                        self.assertEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row[
                                "data"].items():
                            if isinstance(expected_value, datetime):
                                # sql server only keeps milliseconds not microseconds
                                self.assertEqual(
                                    expected_value.isoformat().replace(
                                        '000+00:00',
                                        'Z').replace('+00:00', 'Z'),
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value.isoformat().replace(
                                            '000+00:00',
                                            'Z').replace('+00:00', 'Z'),
                                        actual_row["data"][column_name]))
                            else:
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_row, actual_row))
                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNone(bookmark.get('current_log_version'),
                                  msg="no log_version for incremental")
                self.assertIsNone(bookmark.get('initial_full_table_complete'),
                                  msg="no full table for incremental")
                # find the max value of the replication key
                self.assertEqual(
                    bookmark['replication_key_value'],
                    re.sub(
                        r'\d{3}Z', "Z",
                        max([
                            row[1] for row in stream_expected_data[self.VALUES]
                        ]).strftime("%Y-%m-%dT%H:%M:%S.%fZ")))
                # self.assertEqual(bookmark['replication_key'], 'replication_key_value')

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")
                self.assertEqual(
                    bookmark['version'],
                    new_table_version,
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(
                                     expected_schemas,
                                     records_by_stream[stream]['schema']))
Example #7
    def setUpClass(cls) -> None:
        """Create the expected schema in the test database"""
        drop_all_user_databases()
        database_name = "data_types_database"
        schema_name = "dbo"

        numeric_values = [
            (0, Decimal('-99999.9999'), Decimal('-9999999.999999999999'),
             Decimal('-999999.9999999999999999999999'),
             Decimal('-99999999999999999999999999999999999.999')),
            (1, 0, 0, 0, 0), (2, None, None, None, None),
            (3, Decimal('99999.9999'), Decimal('9999999.999999999999'),
             Decimal('999999.9999999999999999999999'),
             Decimal('99999999999999999999999999999999999.999')),
            (4, Decimal('96701.9382'), Decimal('-4371716.186100650268'),
             Decimal('-367352.306093776232045517794'),
             Decimal('-81147872128956247517327931319278572.985')),
            (5, Decimal('-73621.9366'), Decimal('2564047.277589545531'),
             Decimal('336177.4754683699464233786667'),
             Decimal('46946462608534127558389411015159825.758')),
            (6, Decimal('-3070.7339'), Decimal('6260062.158440967433'),
             Decimal('-987006.0035971607740533206418'),
             Decimal('95478671259010046866787754969592794.61'))
        ]
        numeric_precision_scale = [(9, 4), (19, 12), (28, 22), (38, 3)]

        # TODO - Remove this workaround when we fix decimal precision and use
        # the commented values below to test
        # numeric_values = [
        #     (0, Decimal('-9999999.99'), Decimal('-9999999999999.99')),
        #     (1, 0, 0),
        #     (2, None, None),
        #     (3, Decimal('9999999.99'), Decimal('9999999999999.99')),
        #     (4, Decimal('-4133076.27'), Decimal('8499042653781.28')),
        #     (5, Decimal('-8629188.35'), Decimal('-4589639716080.97')),
        #     (6, Decimal('-9444926.01'), Decimal('7151189415270.4'))]
        # numeric_precision_scale = [(9, 2), (15, 2)]
        numeric_schema = {
            'type': 'object',
            'properties': {
                'numeric_9_4': {
                    'exclusiveMaximum': True,
                    'type': ['number', 'null'],
                    'selected': True,
                    'multipleOf': 0.0001,
                    'maximum': 1e5,
                    'inclusion': 'available',
                    'exclusiveMinimum': True,
                    'minimum': -1e5
                },
                'numeric_19_12': {
                    'exclusiveMaximum': True,
                    'type': ['number', 'null'],
                    'selected': True,
                    'multipleOf': 1e-12,
                    'maximum': 1e7,
                    'inclusion': 'available',
                    'exclusiveMinimum': True,
                    'minimum': -1e7
                },
                'numeric_28_22': {
                    'exclusiveMaximum': True,
                    'type': ['number', 'null'],
                    'selected': True,
                    'multipleOf': 1e-22,
                    'maximum': 1e6,
                    'inclusion': 'available',
                    'exclusiveMinimum': True,
                    'minimum': -1e6
                },
                'numeric_38_3': {
                    'exclusiveMaximum': True,
                    'type': ['number', 'null'],
                    'selected': True,
                    'multipleOf': .001,
                    'maximum': 1e35,
                    'inclusion': 'available',
                    'exclusiveMinimum': True,
                    'minimum': -1e35
                },
                'pk': {
                    'maximum': 2147483647,
                    'type': ['integer'],
                    'inclusion': 'automatic',
                    'minimum': -2147483648,
                    'selected': True
                }
            },
            'selected': True
        }
        # numeric_schema = {
        #     'type': 'object',
        #     'properties': {
        #         'numeric_9_2': {
        #             'exclusiveMaximum': True,
        #             'type': ['number', 'null'],
        #             'selected': True,
        #             'multipleOf': 0.01,
        #             'maximum': 10000000,
        #             'inclusion': 'available',
        #             'exclusiveMinimum': True,
        #             'minimum': -10000000},
        #         'numeric_15_2': {
        #             'exclusiveMaximum': True,
        #             'type': ['number', 'null'],
        #             'selected': True,
        #             'multipleOf': 0.01,
        #             'maximum': 10000000000000,
        #             'inclusion': 'available',
        #             'exclusiveMinimum': True,
        #             'minimum': -10000000000000},
        #         'pk': {
        #             'maximum': 2147483647,
        #             'type': ['integer'],
        #             'inclusion': 'automatic',
        #             'minimum': -2147483648,
        #             'selected': True}},
        #     'selected': True}

        decimal_values = [
            (0, Decimal('-99999.9999'), Decimal('-9999999999999.999999'),
             Decimal('-9999999999999999999999.999999'),
             Decimal('-9999999999999999999999999.9999999999999')),
            (1, 0, 0, 0, 0), (2, None, None, None, None),
            (3, Decimal('99999.9999'), Decimal('9999999999999.999999'),
             Decimal('9999999999999999999999.999999'),
             Decimal('9999999999999999999999999.9999999999999')),
            (4, Decimal('-92473.8401'), Decimal('-4182159664734.645653'),
             Decimal('6101329656084900380190.268036'),
             Decimal('4778017533841887320066645.9761464001349')),
            (5, Decimal('-57970.8157'), Decimal('7735958802279.086687'),
             Decimal('4848737828398517845540.057905'),
             Decimal('2176036096567853905237453.5152648989022')),
            (6, Decimal('57573.9037'), Decimal('5948502499261.181557'),
             Decimal('-6687721783088280707003.076638'),
             Decimal('-6264019242578746090842245.3746225058202'))
        ]
        decimal_precision_scale = [(9, 4), (19, 6), (28, 6), (38, 13)]

        # TODO - Remove this workaround when we fix decimal precision and use
        # the commented values below to test
        # decimal_values = [
        #     (0, Decimal('-9999.99999'), Decimal('-999999999999.999')),
        #     (1, 0, 0),
        #     (2, None, None),
        #     (3, Decimal('9999.99999'), Decimal('999999999999.999')),
        #     (4, Decimal('7191.0647'), Decimal('284159490729.628')),
        #     (5, Decimal('6470.19405'), Decimal('-631069143780.173')),
        #     (6, Decimal('4708.67525'), Decimal('-570692336616.609'))]
        # decimal_precision_scale = [(9, 5), (15, 3)]
        decimal_schema = {
            'type': 'object',
            'properties': {
                'decimal_9_4': {
                    'exclusiveMaximum': True,
                    'type': ['number', 'null'],
                    'selected': True,
                    'multipleOf': 0.0001,
                    'maximum': 1e5,
                    'inclusion': 'available',
                    'exclusiveMinimum': True,
                    'minimum': -1e5
                },
                'decimal_19_6': {
                    'exclusiveMaximum': True,
                    'type': ['number', 'null'],
                    'selected': True,
                    'multipleOf': 1e-6,
                    'maximum': 1e13,
                    'inclusion': 'available',
                    'exclusiveMinimum': True,
                    'minimum': -1e13
                },
                'decimal_28_6': {
                    'exclusiveMaximum': True,
                    'type': ['number', 'null'],
                    'selected': True,
                    'multipleOf': 1e-6,
                    'maximum': 1e22,
                    'inclusion': 'available',
                    'exclusiveMinimum': True,
                    'minimum': -1e22
                },
                'decimal_38_13': {
                    'exclusiveMaximum': True,
                    'type': ['number', 'null'],
                    'selected': True,
                    'multipleOf': 1e-13,
                    'maximum': 1e25,
                    'inclusion': 'available',
                    'exclusiveMinimum': True,
                    'minimum': -1e25
                },
                'pk': {
                    'maximum': 2147483647,
                    'type': ['integer'],
                    'inclusion': 'automatic',
                    'minimum': -2147483648,
                    'selected': True
                }
            },
            'selected': True
        }
        # decimal_schema = {
        #     'type': 'object',
        #     'properties': {
        #         'decimal_15_3': {
        #             'exclusiveMaximum': True,
        #             'type': ['number', 'null'],
        #             'selected': True,
        #             'multipleOf': 0.001,
        #             'maximum': 1000000000000,
        #             'inclusion': 'available',
        #             'exclusiveMinimum': True,
        #             'minimum': -1000000000000},
        #         'decimal_9_5': {
        #             'exclusiveMaximum': True,
        #             'type': ['number', 'null'],
        #             'selected': True,
        #             'multipleOf': 1e-05,
        #             'maximum': 10000,
        #             'inclusion': 'available',
        #             'exclusiveMinimum': True, 'minimum': -10000},
        #         'pk': {
        #             'maximum': 2147483647,
        #             'type': ['integer'],
        #             'inclusion': 'automatic',
        #             'minimum': -2147483648,
        #             'selected': True}},
        #     'selected': True}

        cls.EXPECTED_METADATA = {
            'data_types_database_dbo_numeric_precisions': {
                'is-view': False,
                'schema-name': schema_name,
                'row-count': 0,
                'values': numeric_values,
                'table-key-properties': {'pk'},
                'selected': None,
                'database-name': database_name,
                'stream_name': 'numeric_precisions',
                'fields': [{
                    'pk': {
                        'sql-datatype': 'int',
                        'selected-by-default': True,
                        'inclusion': 'automatic'
                    }
                }, {
                    'numeric_9_4': {
                        'sql-datatype': 'numeric(9,4)',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }, {
                    'numeric_19_12': {
                        'sql-datatype': 'numeric(19,12)',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }, {
                    'numeric_28_22': {
                        'sql-datatype': 'numeric(28,22)',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }, {
                    'numeric_38_3': {
                        'sql-datatype': 'numeric(38,3)',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }],
                'schema': numeric_schema
            },
            'data_types_database_dbo_decimal_precisions': {
                'is-view': False,
                'schema-name': schema_name,
                'row-count': 0,
                'values': decimal_values,
                'table-key-properties': {'pk'},
                'selected': None,
                'database-name': database_name,
                'stream_name': 'decimal_precisions',
                'fields': [{
                    'pk': {
                        'sql-datatype': 'int',
                        'selected-by-default': True,
                        'inclusion': 'automatic'
                    }
                }, {
                    'decimal_9_4': {
                        'sql-datatype': 'decimal(9,4)',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }, {
                    'decimal_19_6': {
                        'sql-datatype': 'decimal(19,6)',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }, {
                    'decimal_28_6': {
                        'sql-datatype': 'decimal(28,6)',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }, {
                    'decimal_38_13': {
                        'sql-datatype': 'decimal(38,13)',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }],
                'schema': decimal_schema
            }
        }
        query_list = list(
            create_database(database_name, "Latin1_General_CS_AS"))
        # query_list.extend(create_schema(database_name, schema_name))

        # TODO - BUG https://stitchdata.atlassian.net/browse/SRCE-1075
        table_name = "numeric_precisions"
        precision_scale = numeric_precision_scale
        column_type = [
            "numeric({},{})".format(precision, scale)
            for precision, scale in precision_scale
        ]
        column_name = ["pk"] + [
            x.replace("(", "_").replace(",", "_").replace(")", "")
            for x in column_type
        ]
        column_type = ["int"] + column_type
        primary_key = {"pk"}
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
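        # e.g. column_def here is ['pk int', 'numeric_9_4 numeric(9,4)', ...]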
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))
        query_list.extend(
            insert(
                database_name, schema_name, table_name, cls.EXPECTED_METADATA[
                    "data_types_database_dbo_numeric_precisions"]["values"]))

        table_name = "decimal_precisions"
        precision_scale = decimal_precision_scale
        column_type = [
            "decimal({},{})".format(precision, scale)
            for precision, scale in precision_scale
        ]
        column_name = ["pk"] + [
            x.replace("(", "_").replace(",", "_").replace(")", "")
            for x in column_type
        ]
        column_type = ["int"] + column_type
        primary_key = {"pk"}
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))
        query_list.extend(
            insert(
                database_name, schema_name, table_name, cls.EXPECTED_METADATA[
                    "data_types_database_dbo_decimal_precisions"]["values"]))

        mssql_cursor_context_manager(*query_list)

        cls.expected_metadata = cls.discovery_expected_metadata
Example #8
    def setUpClass(cls) -> None:
        """Create the expected schema in the test database"""

        database_name = "data_types_database"
        schema_name = "dbo"
        drop_all_user_databases()

        values = [
            (0, date(1, 1, 1), datetime(1753, 1, 1, 0, 0, tzinfo=timezone.utc),
             datetime(1, 1, 1, 0, 0, tzinfo=timezone.utc),
             datetime(1, 1, 1, 13, 46,
                      tzinfo=timezone(timedelta(hours=-14))).isoformat(),
             datetime(1900, 1, 1, 0, 0,
                      tzinfo=timezone.utc), time(0, 0, tzinfo=timezone.utc)),
            (1, date(9999, 12, 31),
             datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc),
             datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc),
             datetime(9999,
                      12,
                      31,
                      10,
                      14,
                      tzinfo=timezone(timedelta(hours=14))).isoformat(),
             datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
             time(23, 59, 59, tzinfo=timezone.utc)),
            (2, None, None, None, None, None, None),
            (3, date(4533, 6, 9),
             datetime(3099, 2, 6, 4, 27, 37, 983000, tzinfo=timezone.utc),
             datetime(9085, 4, 30, 21, 52, 57, 492920, tzinfo=timezone.utc),
             datetime(5749,
                      4,
                      3,
                      1,
                      47,
                      47,
                      110809,
                      tzinfo=timezone(timedelta(hours=10,
                                                minutes=5))).isoformat(),
             datetime(2031, 4, 30, 19, 32, tzinfo=timezone.utc),
             time(21, 9, 56, 0, tzinfo=timezone.utc)),
            (4, date(3476, 10, 14),
             datetime(7491, 4, 5, 8, 46, 0, 360000, tzinfo=timezone.utc),
             datetime(8366, 7, 13, 17, 15, 10, 102386, tzinfo=timezone.utc),
             datetime(2642,
                      6,
                      19,
                      21,
                      10,
                      28,
                      546280,
                      tzinfo=timezone(timedelta(hours=6,
                                                minutes=15))).isoformat(),
             datetime(2024, 6, 22, 0, 36, tzinfo=timezone.utc),
             time(2, 14, 4, 0, tzinfo=timezone.utc))
        ]

        schema = {
            'selected': True,
            'properties': {
                'its_time': {
                    'selected': True,
                    'inclusion': 'available',
                    'type': ['string', 'null']
                },
                'pk': {
                    'maximum': 2147483647,
                    'selected': True,
                    'inclusion': 'automatic',
                    'type': ['integer'],
                    'minimum': -2147483648
                },
                'just_a_date': {
                    'selected': True,
                    'inclusion': 'available',
                    'type': ['string', 'null'],
                    'format': 'date-time'
                },
                'date_and_time': {
                    'selected': True,
                    'inclusion': 'available',
                    'type': ['string', 'null'],
                    'format': 'date-time'
                },
                "bigger_range_and_precision_datetime": {
                    'selected': True,
                    'inclusion': 'available',
                    'type': ['string', 'null'],
                    'format': 'date-time'
                },
                "datetime_with_timezones": {
                    'selected': True,
                    'inclusion': 'available',
                    'type': ['string', 'null'],
                    'format': 'date-time'
                },
                "datetime_no_seconds": {
                    'selected': True,
                    'inclusion': 'available',
                    'type': ['string', 'null'],
                    'format': 'date-time'
                },
                "_sdc_deleted_at": {
                    'format': 'date-time',
                    'type': ['string', 'null']
                }
            },
            'type': 'object'
        }

        fields = [{
            'pk': {
                'sql-datatype': 'int',
                'selected-by-default': True,
                'inclusion': 'automatic'
            }
        }, {
            'just_a_date': {
                'sql-datatype': 'date',
                'selected-by-default': True,
                'inclusion': 'available'
            }
        }, {
            'date_and_time': {
                'sql-datatype': 'datetime',
                'selected-by-default': True,
                'inclusion': 'available'
            }
        }, {
            'bigger_range_and_precision_datetime': {
                'sql-datatype': 'datetime2',
                'selected-by-default': True,
                'inclusion': 'available'
            }
        }, {
            'datetime_with_timezones': {
                'sql-datatype': 'datetimeoffset',
                'selected-by-default': True,
                'inclusion': 'available'
            }
        }, {
            'datetime_no_seconds': {
                'sql-datatype': 'smalldatetime',
                'selected-by-default': True,
                'inclusion': 'available'
            }
        }, {
            'its_time': {
                'sql-datatype': 'time',
                'selected-by-default': True,
                'inclusion': 'available'
            }
        }]

        query_list = list(
            create_database(database_name, "Latin1_General_CS_AS"))
        query_list.extend(enable_database_tracking(database_name))
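        # change tracking has to be enabled on the database (and on each table,
        # via tracking=True below) for LOG_BASED replication to pick up changes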

        table_name = "dates_and_times"
        primary_key = {"pk"}

        column_name = [
            "pk", "just_a_date", "date_and_time",
            "bigger_range_and_precision_datetime", "datetime_with_timezones",
            "datetime_no_seconds", "its_time"
        ]
        column_type = [
            "int", "date", "datetime", "datetime2", "datetimeoffset",
            "smalldatetime", "time"
        ]

        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key,
                         tracking=True))
        query_list.extend(
            insert(database_name, schema_name, table_name, values))

        mssql_cursor_context_manager(*query_list)

        values = [
            (0, date(1, 1, 1), datetime(1753, 1, 1, 0, 0, tzinfo=timezone.utc),
             datetime(1, 1, 1, 0, 0, tzinfo=timezone.utc),
             datetime(1, 1, 1, 13, 46, tzinfo=timezone(
                 timedelta(hours=-14))).astimezone(timezone.utc),
             datetime(1900, 1, 1, 0, 0,
                      tzinfo=timezone.utc), time(0, 0, tzinfo=timezone.utc)),
            (1, date(9999, 12, 31),
             datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc),
             datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc),
             datetime(9999,
                      12,
                      31,
                      10,
                      14,
                      tzinfo=timezone(timedelta(hours=14))).astimezone(
                          timezone.utc),
             datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
             time(23, 59, 59, tzinfo=timezone.utc)),
            (2, None, None, None, None, None, None),
            (3, date(4533, 6, 9),
             datetime(3099, 2, 6, 4, 27, 37, 983000, tzinfo=timezone.utc),
             datetime(9085, 4, 30, 21, 52, 57, 492920, tzinfo=timezone.utc),
             datetime(5749,
                      4,
                      3,
                      1,
                      47,
                      47,
                      110809,
                      tzinfo=timezone(timedelta(
                          hours=10, minutes=5))).astimezone(timezone.utc),
             datetime(2031, 4, 30, 19, 32, tzinfo=timezone.utc),
             time(21, 9, 56, 0, tzinfo=timezone.utc)),
            (4, date(3476, 10, 14),
             datetime(7491, 4, 5, 8, 46, 0, 360000, tzinfo=timezone.utc),
             datetime(8366, 7, 13, 17, 15, 10, 102386, tzinfo=timezone.utc),
             datetime(2642,
                      6,
                      19,
                      21,
                      10,
                      28,
                      546280,
                      tzinfo=timezone(timedelta(
                          hours=6, minutes=15))).astimezone(timezone.utc),
             datetime(2024, 6, 22, 0, 36, tzinfo=timezone.utc),
             time(2, 14, 4, 0, tzinfo=timezone.utc))
        ]

        query_list.extend(
            insert(database_name, schema_name, table_name, values))

        mssql_cursor_context_manager(*query_list)

        cls.EXPECTED_METADATA = {
            '{}_{}_{}'.format(database_name, schema_name, table_name): {
                'is-view': False,
                'schema-name': schema_name,
                'row-count': 0,
                'values': values,
                'table-key-properties': primary_key,
                'selected': None,
                'database-name': database_name,
                'stream_name': table_name,
                'fields': fields,
                'schema': schema
            }
        }
        cls.expected_metadata = cls.discovery_expected_metadata
Example #9
    def setUpClass(cls) -> None:
        """Create the expected schema in the test database"""
        drop_all_user_databases()
        database_name = "data_types_database"
        schema_name = "dbo"

        # use all valid unicode characters
        chars = list(range(0, 55296))
        chars.extend(range(57344, sys.maxunicode))
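        # (the skipped code points 55296-57343 are the UTF-16 surrogate range,
        # which is not valid as standalone characters)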
        chars.reverse()  # pop starting with ascii characters

        char_values = [(pk, "".join([chr(chars.pop()) for _ in range(2)]))
                       for pk in range(16)]
        char_name = "invalid_characters_{}".format("".join(OTHERS).replace(
            '"', ""))
        char_schema = {
            'type': 'object',
            'selected': True,
            'properties': {
                char_name: {
                    'type': ['string', 'null'],
                    'maxLength': 2,
                    'inclusion': 'available',
                    'selected': True
                },
                # 'minLength': 2},
                'pk': {
                    'maximum': 2147483647,
                    'type': ['integer'],
                    'inclusion': 'automatic',
                    'selected': True,
                    'minimum': -2147483648
                }
            }
        }

        varchar_values = [(pk, chr(chars.pop()), "".join([
            chr(chars.pop()) for _ in range(15)
        ]), "".join([chr(chars.pop()) for _ in range(randint(1, 16))]))
                          for pk in range(3)]
        varchar_name = "1834871389834_start_with_numbers"
        varchar_schema = {
            'type': 'object',
            'selected': True,
            'properties': {
                'pk': {
                    'maximum': 2147483647,
                    'type': ['integer'],
                    'inclusion': 'automatic',
                    'selected': True,
                    'minimum': -2147483648
                },
                'varchar_8000': {
                    'type': ['string', 'null'],
                    'maxLength': 8000,
                    'inclusion': 'available',
                    'selected': True
                },  # 'minLength': 0},
                varchar_name: {
                    'type': ['string', 'null'],
                    'maxLength': 5,
                    'inclusion': 'available',
                    'selected': True
                },
                # 'minLength': 0},
                'varchar_max': {
                    'type': ['string', 'null'],
                    'maxLength': 2147483647,
                    'inclusion': 'available',
                    'selected': True
                }
            }
        }
        # 'minLength': 0}}}

        nchar_values = [(pk, "".join([chr(chars.pop()) for _ in range(4)]))
                        for pk in range(3)]
        # expect values to be right-padded with spaces in the db; nchar length
        # counts UTF-16 code units, so nchar(8) occupies 16 bytes
        nchar_values = [
            (x, "{}{}".format(y,
                              " " * ((16 - len(y.encode('utf-16-le'))) // 2)))
            for x, y in nchar_values
        ]
        nchar_name = "SELECT"
        nchar_schema = {
            'type': 'object',
            'selected': True,
            'properties': {
                nchar_name: {
                    'type': ['string', 'null'],
                    'maxLength': 8,
                    'inclusion': 'available',
                    'selected': True
                },
                # 'minLength': 8},  # length is based on bytes, not characters
                'pk': {
                    'maximum': 2147483647,
                    'type': ['integer'],
                    'inclusion': 'automatic',
                    'selected': True,
                    'minimum': -2147483648
                }
            }
        }

        chars.reverse()
        nvarchar_values = [(pk, chr(chars.pop()), "".join([
            chr(chars.pop()) for _ in range(8)
        ]), "".join([chr(chars.pop()) for _ in range(randint(1, 8))]))
                           for pk in range(4)]
        nvarchar_name = "hebrew_ישראל"
        nvarchar_schema = {
            'type': 'object',
            'selected': True,
            'properties': {
                'nvarchar_max': {
                    'type': ['string', 'null'],
                    'maxLength': 2147483647,
                    'inclusion': 'available',
                    'selected': True
                },
                # 'minLength': 0},
                'pk': {
                    'maximum': 2147483647,
                    'type': ['integer'],
                    'inclusion': 'automatic',
                    'selected': True,
                    'minimum': -2147483648
                },
                'nvarchar_4000': {
                    'type': ['string', 'null'],
                    'maxLength': 4000,
                    'inclusion': 'available',
                    'selected': True
                },
                # 'minLength': 0},
                nvarchar_name: {
                    'type': ['string', 'null'],
                    'maxLength': 5,
                    'inclusion': 'available',
                    'selected': True
                }
            }
        }
        # 'minLength': 0}}}

        cls.EXPECTED_METADATA = {
            'data_types_database_dbo_{}'.format(char_name): {
                'is-view': False,
                'schema-name': schema_name,
                'row-count': 0,
                'values': char_values,
                'table-key-properties': {'pk'},
                'selected': None,
                'database-name': database_name,
                'stream_name': char_name,
                'fields': [{
                    'pk': {
                        'sql-datatype': 'int',
                        'selected-by-default': True,
                        'inclusion': 'automatic'
                    }
                }, {
                    char_name: {
                        'sql-datatype': 'char',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }],
                'schema': char_schema
            },
            'data_types_database_dbo_{}'.format(varchar_name): {
                'is-view': False,
                'schema-name': schema_name,
                'row-count': 0,
                'values': varchar_values,
                'table-key-properties': {'pk'},
                'selected': None,
                'database-name': database_name,
                'stream_name': varchar_name,
                'fields': [{
                    'pk': {
                        'sql-datatype': 'int',
                        'selected-by-default': True,
                        'inclusion': 'automatic'
                    }
                }, {
                    varchar_name: {
                        'sql-datatype': 'varchar',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }, {
                    'varchar_8000': {
                        'sql-datatype': 'varchar',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }, {
                    'varchar_max': {
                        'sql-datatype': 'varchar',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }],
                'schema': varchar_schema
            },
            'data_types_database_dbo_{}'.format(nchar_name): {
                'is-view': False,
                'schema-name': schema_name,
                'row-count': 0,
                'values': nchar_values,
                'table-key-properties': {'pk'},
                'selected': None,
                'database-name': database_name,
                'stream_name': nchar_name,
                'fields': [{
                    'pk': {
                        'sql-datatype': 'int',
                        'selected-by-default': True,
                        'inclusion': 'automatic'
                    }
                }, {
                    nchar_name: {
                        'sql-datatype': 'nchar',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }],
                'schema': nchar_schema
            },
            'data_types_database_dbo_{}'.format(nvarchar_name): {
                'is-view': False,
                'schema-name': schema_name,
                'row-count': 0,
                'values': nvarchar_values,
                'table-key-properties': {'pk'},
                'selected': None,
                'database-name': database_name,
                'stream_name': nvarchar_name,
                'fields': [{
                    'pk': {
                        'sql-datatype': 'int',
                        'selected-by-default': True,
                        'inclusion': 'automatic'
                    }
                }, {
                    nvarchar_name: {
                        'sql-datatype': 'nvarchar',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }, {
                    'nvarchar_4000': {
                        'sql-datatype': 'nvarchar',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }, {
                    'nvarchar_max': {
                        'sql-datatype': 'nvarchar',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }],
                'schema': nvarchar_schema
            },
        }
        query_list = list(
            create_database(database_name, "Latin1_General_CS_AS"))

        table_name = '"{}"'.format(char_name)
        column_name = ["pk", table_name]  # , "char_8000"]
        column_type = ["int", "char(2)"]  # , "char(8000)"]
        primary_key = {"pk"}
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))
        query_list.extend(
            insert(database_name, schema_name, table_name, char_values))

        table_name = "[{}]".format(varchar_name)
        column_name = ["pk", table_name, "varchar_8000", "varchar_max"]
        column_type = ["int", "varchar(5)", "varchar(8000)", "varchar(max)"]
        primary_key = {"pk"}
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))
        query_list.extend(
            insert(database_name, schema_name, table_name, varchar_values))

        table_name = "[{}]".format(nchar_name)
        column_name = ["pk", "[{}]".format(nchar_name)]
        column_type = ["int", "nchar(8)"]  # , "nchar(4000)"]
        primary_key = {"pk"}
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))
        # strip padding off query data
        nchar_query_values = [(x, y.rstrip() if isinstance(y, str) else y)
                              for x, y in nchar_values]
        query_list.extend(
            insert(database_name, schema_name, table_name, nchar_query_values))

        table_name = nvarchar_name
        column_name = ["pk", nvarchar_name, "nvarchar_4000", "nvarchar_max"]
        column_type = ["int", "nvarchar(5)", "nvarchar(4000)", "nvarchar(max)"]
        primary_key = {"pk"}
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))

        query_list.extend(
            insert(database_name, schema_name, table_name, nvarchar_values))
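        # append a SQL comment recording how many candidate characters were never exercised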
        query_list.extend(
            ['-- there are {} characters left to test'.format(len(chars))])

        cls.expected_metadata = cls.discovery_expected_metadata

        mssql_cursor_context_manager(*query_list)
Example #10
    def setUpClass(cls) -> None:
        """Create the expected schema in the test database"""
        drop_all_user_databases()
        database_name = "database_name"
        query_list = list(create_database(database_name, "Latin1_General_CS_AS"))

        schema_name = "schema_name"
        query_list.extend(create_schema(database_name, schema_name))

        table_name = "table_name"
        column_name = ["column_name"]
        column_type = ["int"]
        primary_key = set()
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(create_table(database_name, schema_name, table_name, column_def,
                                       primary_key=primary_key))
        cls.add_expected_metadata(cls, database_name, schema_name, table_name, column_name,
                                  column_type, primary_key)

        table_name = "TABLE_NAME"
        column_name = ["column_name"]
        column_type = ["int"]
        primary_key = set()
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(create_table(database_name, schema_name, table_name, column_def,
                                       primary_key=primary_key))
        cls.add_expected_metadata(cls, database_name, schema_name, table_name, column_name,
                                  column_type, primary_key)

        schema_name = "SCHEMA_NAME"
        query_list.extend(create_schema(database_name, schema_name))

        table_name = "table_name"
        column_name = ["column_name"]
        column_type = ["int"]
        primary_key = set()
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(create_table(database_name, schema_name, table_name, column_def,
                                       primary_key=primary_key))
        cls.add_expected_metadata(cls, database_name, schema_name, table_name, column_name,
                                  column_type, primary_key)

        table_name = "TABLE_NAME"
        column_name = ["column_name"]
        column_type = ["int"]
        primary_key = set()
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(create_table(database_name, schema_name, table_name, column_def,
                                       primary_key=primary_key))
        cls.add_expected_metadata(cls, database_name, schema_name, table_name, column_name,
                                  column_type, primary_key)

        database_name = "DATABASE_NAME_NO_COLLISION"
        query_list.extend(create_database(database_name, "Latin1_General_CS_AS"))

        schema_name = "schema_name"
        query_list.extend(create_schema(database_name, schema_name))

        table_name = "table_name"
        column_name = ["column_name"]
        column_type = ["int"]
        primary_key = set()
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(create_table(database_name, schema_name, table_name, column_def,
                                       primary_key=primary_key))
        cls.add_expected_metadata(cls, database_name, schema_name, table_name, column_name,
                                  column_type, primary_key)

        table_name = "TABLE_NAME"
        column_name = ["column_name"]
        column_type = ["int"]
        primary_key = set()
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(create_table(database_name, schema_name, table_name, column_def,
                                       primary_key=primary_key))
        cls.add_expected_metadata(cls, database_name, schema_name, table_name, column_name,
                                  column_type, primary_key)

        schema_name = "SCHEMA_NAME"
        query_list.extend(create_schema(database_name, schema_name))

        table_name = "table_name"
        column_name = ["column_name"]
        column_type = ["int"]
        primary_key = set()
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(create_table(database_name, schema_name, table_name, column_def,
                                       primary_key=primary_key))
        cls.add_expected_metadata(cls, database_name, schema_name, table_name, column_name,
                                  column_type, primary_key)

        table_name = "TABLE_NAME"
        column_name = ["column_name"]
        column_type = ["int"]
        primary_key = set()
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(create_table(database_name, schema_name, table_name, column_def,
                                       primary_key=primary_key))
        cls.add_expected_metadata(cls, database_name, schema_name, table_name, column_name,
                                  column_type, primary_key)

        mssql_cursor_context_manager(*query_list)

        cls.expected_metadata = cls.discovery_expected_metadata
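
    # A possible simplification (a sketch added for illustration, not part of the
    # original test): the eight blocks above differ only in database, schema, and
    # table name, so the same queries and expectations could be built with nested
    # loops over the case variants, assuming the same helpers used above. The
    # helper name below is hypothetical.
    @classmethod
    def _build_case_collision_setup(cls):
        query_list = []
        for database_name in ("database_name", "DATABASE_NAME_NO_COLLISION"):
            query_list.extend(create_database(database_name, "Latin1_General_CS_AS"))
            for schema_name in ("schema_name", "SCHEMA_NAME"):
                query_list.extend(create_schema(database_name, schema_name))
                for table_name in ("table_name", "TABLE_NAME"):
                    column_name, column_type = ["column_name"], ["int"]
                    primary_key = set()
                    column_def = [" ".join(x) for x in zip(column_name, column_type)]
                    query_list.extend(create_table(database_name, schema_name,
                                                   table_name, column_def,
                                                   primary_key=primary_key))
                    cls.add_expected_metadata(cls, database_name, schema_name,
                                              table_name, column_name, column_type,
                                              primary_key)
        return query_list  # caller runs mssql_cursor_context_manager(*query_list)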
Example #11
    def setUpClass(cls) -> None:
        """Create the expected schema in the test database"""
        drop_all_user_databases()
        database_name = "constraints_database"
        schema_name = "dbo"

        query_list = list(
            create_database(database_name, "Latin1_General_CS_AS"))
        # query_list.extend(create_schema(database_name, schema_name))

        table_name = "no_constraints"
        column_name = ["column_name"]
        column_type = ["int"]
        primary_key = set()  # ["bigint"]
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))
        cls.add_expected_metadata(cls, database_name, schema_name, table_name,
                                  column_name, column_type, primary_key)

        table_name = "multiple_column_pk"
        column_name = ["first_name", "last_name", "info"]
        column_type = ["varchar(256)", "varchar(256)", "int"]
        primary_key = ["first_name", "last_name"]
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))
        cls.add_expected_metadata(cls, database_name, schema_name, table_name,
                                  column_name, column_type, primary_key)

        table_name = "single_column_pk"
        column_name = ["pk", "data"]
        column_type = ["int", "int"]
        primary_key = ["pk"]
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))
        cls.add_expected_metadata(cls, database_name, schema_name, table_name,
                                  column_name, column_type, primary_key)

        table_name = "pk_with_unique_not_null"
        column_name = ["pk", "data"]
        column_type = ["int", "int NOT NULL UNIQUE"]
        primary_key = ["pk"]
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))
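        # discovery reports only the base sql-datatype, so the UNIQUE NOT NULL
        # clause is dropped from the expected column type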
        column_type = ["int", "int"]
        cls.add_expected_metadata(cls, database_name, schema_name, table_name,
                                  column_name, column_type, primary_key)

        table_name = "pk_with_fk"
        column_name = ["pk", "fk"]
        column_type = ["int", "int"]
        primary_key = ["pk"]
        foreign_key = "fk"
        reference = "{}.pk_with_unique_not_null(pk)".format(schema_name)
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key,
                         foreign_key=foreign_key,
                         reference=reference))
        cls.add_expected_metadata(cls, database_name, schema_name, table_name,
                                  column_name, column_type, primary_key)

        table_name = "view_with_join"
        column_name = ["column1", "data", "column2"]
        column_type = ["int", "int", "int"]
        primary_key = []
        select = ("SELECT p.pk as column1, data, f.pk as column2 "
                  "FROM pk_with_unique_not_null p "
                  "JOIN pk_with_fk f on p.pk = f.fk")
        query_list.extend(create_view(schema_name, table_name, select))
        cls.add_expected_metadata(cls,
                                  database_name,
                                  schema_name,
                                  table_name,
                                  column_name,
                                  column_type,
                                  primary_key,
                                  view=True)

        table_name = "table_with_index"
        column_name = ["not_pk", "data"]
        column_type = ["int", "int NOT NULL INDEX myindex"]
        primary_key = []
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))
        column_type = ["int", "int"]
        cls.add_expected_metadata(cls, database_name, schema_name, table_name,
                                  column_name, column_type, primary_key)

        table_name = "default_column"
        column_name = ["pk", "created_at"]
        column_type = ["int", "datetimeoffset DEFAULT CURRENT_TIMESTAMP"]
        primary_key = ["pk"]
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))
        column_type = ["int", "datetimeoffset"]
        cls.add_expected_metadata(cls, database_name, schema_name, table_name,
                                  column_name, column_type, primary_key)

        table_name = "check_constraint"
        column_name = ["pk", "birthday"]
        column_type = [
            "int", "datetimeoffset CHECK (birthday <= CURRENT_TIMESTAMP)"
        ]
        primary_key = ["pk"]
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))
        column_type = ["int", "datetimeoffset"]
        cls.add_expected_metadata(cls, database_name, schema_name, table_name,
                                  column_name, column_type, primary_key)

        # TODO add in identity to this test.
        mssql_cursor_context_manager(*query_list)

        cls.expected_metadata = cls.discovery_expected_metadata
Example #12
    def test_run(self):
        """stream_expected_data[self.VALUES]
        Verify that a full sync can send capture all data and send it in the correct format
        for integer and boolean (bit) data.
        Verify that the fist sync sends an activate immediately.
        Verify that the table version is incremented up
        """
        print("running test {}".format(self.name()))

        conn_id = self.create_connection()

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # get the catalog information of discovery
        found_catalogs = menagerie.get_catalogs(conn_id)
        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'INCREMENTAL',
                'replication-key': 'replication_key_column'
            }
        }]
        BaseTapTest.select_all_streams_and_fields(conn_id,
                                                  found_catalogs,
                                                  additional_md=additional_md)

        # clear state
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify record counts of streams
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)

        # verify records match on the first sync
        records_by_stream = runner.get_records_from_target_output()

        table_version = dict()
        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                table_version[stream] = records_by_stream[stream][
                    'table_version']

                # verify on the first sync you get
                # activate version message before and after all data for the full table
                # and before the logical replication part
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-1]['action'],
                    'activate_version')
                self.assertTrue(
                    all([
                        m["action"] == "upsert"
                        for m in records_by_stream[stream]['messages'][1:-1]
                    ]),
                    msg="Expect all but the first message to be upserts")
                self.assertEqual(
                    len(stream_expected_data[self.VALUES]),
                    len(records_by_stream[stream]['messages'][1:-1]),
                    msg="incorrect number of upserts")

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(zip(column_names,
                                                      row_values))
                    }
                } for row_values in sorted(stream_expected_data[self.VALUES],
                                           key=lambda row:
                                           (row[-1] is not None, row[-1]))]
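                # the key tuple sorts rows with a NULL replication key first
                # (False < True), then by the replication key itself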

                # Verify all data is correct for incremental
                for expected_row, actual_row in list(
                        zip(expected_messages,
                            records_by_stream[stream]['messages'][1:-1])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")
                        self.assertEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row[
                                "data"].items():
                            if isinstance(expected_value, Decimal):
                                self.assertEqual(
                                    type(actual_row["data"][column_name]),
                                    Decimal,
                                    msg=
                                    "decimal value is not represented as a number"
                                )
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_row, actual_row))
                            else:
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_row, actual_row))
                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNone(bookmark.get('current_log_version'),
                                  msg="no log_version for incremental")
                self.assertIsNone(bookmark.get('initial_full_table_complete'),
                                  msg="no full table for incremental")
                # find the max value of the replication key
                expected_bookmark = max([
                    row[-1] for row in stream_expected_data[self.VALUES]
                    if row[-1] is not None
                ])
                # currently decimal replication keys aren't supported in the front end.  If they are at a later point
                # this should be a decimal comparison. https://stitchdata.atlassian.net/browse/SRCE-1331
                self.assertEqual(bookmark['replication_key_value'],
                                 float(expected_bookmark))
                # self.assertEqual(bookmark['replication_key'], 'replication_key_value')

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(
                    records_by_stream[stream]['schema'],
                    simplejson.loads(simplejson.dumps(expected_schemas),
                                     use_decimal=True),
                    msg="expected: {} != actual: {}".format(
                        expected_schemas, records_by_stream[stream]['schema']))

        # ----------------------------------------------------------------------
        # invoke the sync job AGAIN after inserting, updating, and deleting rows
        # ----------------------------------------------------------------------

        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "numeric_precisions"
        precision_scale = NUMERIC_PRECISION_SCALE
        column_type = [
            "numeric({},{})".format(precision, scale)
            for precision, scale in precision_scale
        ]
        column_name = ["pk"] + [
            x.replace("(", "_").replace(",", "_").replace(")", "")
            for x in column_type[:-1]
        ] + ["replication_key_column"]
        insert_value = [
            (8, Decimal(100), Decimal(100), Decimal(100), Decimal(100)),
            (7, Decimal('99999.9995'), Decimal('9999999.999999999999'),
             Decimal('999999.9999999999999999999999'),
             Decimal('99999999999999999999999999999999999.995'))
        ]
        update_value = [
            (5, Decimal(100), Decimal(100), Decimal(100), Decimal(100)),
            (6, Decimal('99999.9999'), Decimal('9999999.999999999999'),
             Decimal('999999.9999999999999999999999'),
             Decimal('99999999999999999999999999999999999.999'))
        ]
        delete_value = [(4, )]
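        # incremental replication cannot detect deletes, so the deleted pk 4 row
        # simply drops out of the expected values below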
        query_list = (insert(database_name, schema_name, table_name,
                             insert_value))
        query_list.extend(
            delete_by_pk(database_name, schema_name, table_name, delete_value,
                         column_name[:1]))
        query_list.extend(
            update_by_pk(database_name, schema_name, table_name, update_value,
                         column_name))
        mssql_cursor_context_manager(*query_list)
        insert_value = insert_value[-1:]  # only repl_key >= gets included
        update_value = update_value[-1:]
        self.EXPECTED_METADATA["data_types_database_dbo_numeric_precisions"][
            "values"] = [
                (3, Decimal('99999.9993'), Decimal('9999999.999999999999'),
                 Decimal('999999.9999999999999999999999'),
                 Decimal('99999999999999999999999999999999999.993')),
            ] + update_value + insert_value

        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "decimal_precisions"
        precision_scale = DECIMAL_PRECISION_SCALE
        column_type = [
            "decimal({},{})".format(precision, scale)
            for precision, scale in precision_scale
        ]
        column_name = ["pk"] + [
            x.replace("(", "_").replace(",", "_").replace(")", "")
            for x in column_type[:-1]
        ] + ["replication_key_column"]
        insert_value = [
            (8, Decimal(100), Decimal(100), Decimal(100), Decimal(100)),
            (7, Decimal('99999.9995'), Decimal('9999999999999.999999'),
             Decimal('9999999999999999999999.999999'),
             Decimal('9999999999999999999999999.9999999999995'))
        ]
        update_value = [
            (5, Decimal(100), Decimal(100), Decimal(100), Decimal(100)),
            (6, Decimal('99999.9999'), Decimal('9999999999999.999999'),
             Decimal('9999999999999999999999.999999'),
             Decimal('9999999999999999999999999.9999999999999'))
        ]
        delete_value = [(4, )]
        query_list = (insert(database_name, schema_name, table_name,
                             insert_value))
        query_list.extend(
            delete_by_pk(database_name, schema_name, table_name, delete_value,
                         column_name[:1]))
        query_list.extend(
            update_by_pk(database_name, schema_name, table_name, update_value,
                         column_name))
        mssql_cursor_context_manager(*query_list)
        insert_value = insert_value[-1:]  # only repl_key >= gets included
        update_value = update_value[-1:]
        self.EXPECTED_METADATA["data_types_database_dbo_decimal_precisions"][
            "values"] = [
                (3, Decimal('99999.9993'), Decimal('9999999999999.999999'),
                 Decimal('9999999999999999999999.999999'),
                 Decimal('9999999999999999999999999.9999999999993'))
            ] + update_value + insert_value

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)
        records_by_stream = runner.get_records_from_target_output()

        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                new_table_version = records_by_stream[stream]['table_version']

                # verify on a subsequent sync you get activate version message only after all data
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-1]['action'],
                    'activate_version')
                self.assertTrue(
                    all([
                        message["action"] == "upsert" for message in
                        records_by_stream[stream]['messages'][1:-1]
                    ]))

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(zip(column_names,
                                                      row_values))
                    }
                } for row_values in sorted(stream_expected_data[self.VALUES],
                                           key=lambda row:
                                           (row[-1] is not None, row[-1]))]

                # remove sequences from actual values for comparison
                [
                    message.pop("sequence")
                    for message in records_by_stream[stream]['messages'][1:-1]
                ]

                # Verify all data is correct
                for expected_row, actual_row in list(
                        zip(expected_messages,
                            records_by_stream[stream]['messages'][1:-1])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")

                        # we only send the _sdc_deleted_at column for deleted rows
                        self.assertEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row[
                                "data"].items():
                            if isinstance(expected_value, Decimal):
                                self.assertEqual(
                                    type(actual_row["data"][column_name]),
                                    Decimal,
                                    msg=
                                    "decimal value is not represented as a number"
                                )
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_row, actual_row))
                            else:
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_row, actual_row))

                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNone(bookmark.get('current_log_version'),
                                  msg="no log_version for incremental")
                self.assertIsNone(bookmark.get('initial_full_table_complete'),
                                  msg="no full table for incremental")
                # find the max value of the replication key
                expected_bookmark = max([
                    row[-1] for row in stream_expected_data[self.VALUES]
                    if row[-1] is not None
                ])
                # currently decimal replication keys aren't supported in the front end.  If they are at a later point
                # this should be a decimal comparison. https://stitchdata.atlassian.net/browse/SRCE-1331
                self.assertEqual(bookmark['replication_key_value'],
                                 float(expected_bookmark))
                # self.assertEqual(bookmark['replication_key'], 'replication_key_value')

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")
                self.assertEqual(
                    bookmark['version'],
                    new_table_version,
                    msg="expected bookmark for stream to match the new table version")

                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(
                    records_by_stream[stream]['schema'],
                    simplejson.loads(simplejson.dumps(expected_schemas),
                                     use_decimal=True),
                    msg="expected: {} != actual: {}".format(
                        expected_schemas, records_by_stream[stream]['schema']))
Example #13
    def setUpClass(cls) -> None:
        """Create the expected schema in the test database"""

        drop_all_user_databases()
        database_name = "database_name"
        query_list = list(
            create_database(database_name, "Latin1_General_CS_AS"))

        schema_name = "schema_name"
        query_list.extend(create_schema(database_name, schema_name))

        table_name = "table_name"
        fields = [{
            'pk': {
                'sql-datatype': 'int',
                'selected-by-default': True,
                'inclusion': 'automatic'
            }
        }, {
            'column_name': {
                'sql-datatype': 'int',
                'selected-by-default': True,
                'inclusion': 'available'
            }
        }]
        schema = {
            'type': 'object',
            'properties': {
                'column_name': {
                    'type': ['integer', 'null'],
                    'minimum': -2147483648,
                    'maximum': 2147483647,
                    'inclusion': 'available',
                    'selected': True
                },
                'pk': {
                    'type': ['integer'],
                    'minimum': -2147483648,
                    'maximum': 2147483647,
                    'inclusion': 'automatic',
                    'selected': True
                }
            },
            'selected': True
        }

        cls.EXPECTED_METADATA = {
            '{}_{}_{}'.format(database_name, schema_name, table_name): {
                'is-view': False,
                'schema-name': schema_name,
                'row-count': 0,
                'values': [(0, 1), (1, 2)],
                'table-key-properties': {'pk'},
                'selected': None,
                'database-name': database_name,
                'stream_name': table_name,
                'fields': fields,
                'schema': schema
            },
        }

        column_name = ["pk", "column_name"]
        column_type = ["int", "int"]
        primary_key = {"pk"}
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))
        query_list.extend(
            insert(
                database_name, schema_name, table_name,
                cls.EXPECTED_METADATA['{}_{}_{}'.format(
                    database_name, schema_name, table_name)]["values"]))

        table_name = "TABLE_NAME"

        cls.EXPECTED_METADATA['{}_{}_{}'.format(database_name, schema_name, table_name)] = {
            'is-view': False,
            'schema-name': schema_name,
            'row-count': 0,
            'values': [(0, 3), (1, 4)],
            'table-key-properties': {'pk'},
            'selected': None,
            'database-name': database_name,
            'stream_name': table_name,
            'fields': fields,
            'schema': schema
        }

        column_name = ["pk", "column_name"]
        column_type = ["int", "int"]
        primary_key = {"pk"}
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))
        query_list.extend(
            insert(
                database_name, schema_name, table_name,
                cls.EXPECTED_METADATA['{}_{}_{}'.format(
                    database_name, schema_name, table_name)]["values"]))

        schema_name = "SCHEMA_NAME"
        query_list.extend(create_schema(database_name, schema_name))

        table_name = "table_name"

        cls.EXPECTED_METADATA['{}_{}_{}'.format(database_name, schema_name, table_name)] = {
            'is-view': False,
            'schema-name': schema_name,
            'row-count': 0,
            'values': [(0, 5), (1, 6)],
            'table-key-properties': {'pk'},
            'selected': None,
            'database-name': database_name,
            'stream_name': table_name,
            'fields': fields,
            'schema': schema
        }

        column_name = ["pk", "column_name"]
        column_type = ["int", "int"]
        primary_key = {"pk"}
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))
        query_list.extend(
            insert(
                database_name, schema_name, table_name,
                cls.EXPECTED_METADATA['{}_{}_{}'.format(
                    database_name, schema_name, table_name)]["values"]))

        table_name = "TABLE_NAME"

        cls.EXPECTED_METADATA['{}_{}_{}'.format(database_name, schema_name, table_name)] = {
            'is-view': False,
            'schema-name': schema_name,
            'row-count': 0,
            'values': [(0, 7), (1, 8)],
            'table-key-properties': {'pk'},
            'selected': None,
            'database-name': database_name,
            'stream_name': table_name,
            'fields': fields,
            'schema': schema
        }

        column_name = ["pk", "column_name"]
        column_type = ["int", "int"]
        primary_key = {"pk"}
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))
        query_list.extend(
            insert(
                database_name, schema_name, table_name,
                cls.EXPECTED_METADATA['{}_{}_{}'.format(
                    database_name, schema_name, table_name)]["values"]))

        database_name = "DATABASE_NAME_NO_COLLISION"
        query_list.extend(
            create_database(database_name, "Latin1_General_CS_AS"))

        schema_name = "schema_name"
        query_list.extend(create_schema(database_name, schema_name))

        table_name = "table_name"

        cls.EXPECTED_METADATA['{}_{}_{}'.format(database_name, schema_name, table_name)] = {
            'is-view': False,
            'schema-name': schema_name,
            'row-count': 0,
            'values': [(0, 9), (1, 10)],
            'table-key-properties': {'pk'},
            'selected': None,
            'database-name': database_name,
            'stream_name': table_name,
            'fields': fields,
            'schema': schema
        }

        column_name = ["pk", "column_name"]
        column_type = ["int", "int"]
        primary_key = {"pk"}
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))
        query_list.extend(
            insert(
                database_name, schema_name, table_name,
                cls.EXPECTED_METADATA['{}_{}_{}'.format(
                    database_name, schema_name, table_name)]["values"]))

        table_name = "TABLE_NAME"

        cls.EXPECTED_METADATA['{}_{}_{}'.format(database_name, schema_name, table_name)] = {
            'is-view': False,
            'schema-name': schema_name,
            'row-count': 0,
            'values': [(0, 11), (1, 12)],
            'table-key-properties': {'pk'},
            'selected': None,
            'database-name': database_name,
            'stream_name': table_name,
            'fields': fields,
            'schema': schema
        }

        column_name = ["pk", "column_name"]
        column_type = ["int", "int"]
        primary_key = {"pk"}
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))
        query_list.extend(
            insert(
                database_name, schema_name, table_name,
                cls.EXPECTED_METADATA['{}_{}_{}'.format(
                    database_name, schema_name, table_name)]["values"]))

        schema_name = "SCHEMA_NAME"
        query_list.extend(create_schema(database_name, schema_name))

        table_name = "table_name"

        cls.EXPECTED_METADATA['{}_{}_{}'.format(database_name, schema_name,
                                                table_name)] = {
                                                    'is-view': False,
                                                    'schema-name': schema_name,
                                                    'row-count': 0,
                                                    'values': [(0, 13),
                                                               (1, 14)],
                                                    'table-key-properties':
                                                    {'pk'},
                                                    'selected': None,
                                                    'database-name':
                                                    database_name,
                                                    'stream_name': table_name,
                                                    'fields': fields,
                                                    'schema': schema
                                                }

        column_name = ["pk", "column_name"]
        column_type = ["int", "int"]
        primary_key = {"pk"}
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))
        query_list.extend(
            insert(
                database_name, schema_name, table_name,
                cls.EXPECTED_METADATA['{}_{}_{}'.format(
                    database_name, schema_name, table_name)]["values"]))

        table_name = "TABLE_NAME"

        cls.EXPECTED_METADATA['{}_{}_{}'.format(database_name, schema_name,
                                                table_name)] = {
                                                    'is-view': False,
                                                    'schema-name': schema_name,
                                                    'row-count': 0,
                                                    'values': [(0, 15),
                                                               (1, 16)],
                                                    'table-key-properties':
                                                    {'pk'},
                                                    'selected': None,
                                                    'database-name':
                                                    database_name,
                                                    'stream_name': table_name,
                                                    'fields': fields,
                                                    'schema': schema
                                                }

        column_name = ["pk", "column_name"]
        column_type = ["int", "int"]
        primary_key = {"pk"}
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))
        query_list.extend(
            insert(
                database_name, schema_name, table_name,
                cls.EXPECTED_METADATA['{}_{}_{}'.format(
                    database_name, schema_name, table_name)]["values"]))

        mssql_cursor_context_manager(*query_list)
        cls.expected_metadata = cls.discovery_expected_metadata
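A note on the setup helper every example in this set shares: each test builds up
query_list and hands it to mssql_cursor_context_manager(*query_list), which is
expected to execute the statements in order against the test server. The real
helper lives in the tap's test utilities and is not shown here; the following is
only a minimal sketch of what the call shape implies, assuming a pyodbc
connection with autocommit and placeholder connection details (all assumptions,
not the actual implementation):

import pyodbc  # assumed driver; the tap's actual helper may differ

def mssql_cursor_context_manager(*queries):
    """Run each setup statement in order against the test server (sketch)."""
    connection = pyodbc.connect(
        "DRIVER={ODBC Driver 17 for SQL Server};"
        "SERVER=localhost;UID=sa;PWD=<placeholder>",  # placeholder credentials
        autocommit=True)
    try:
        cursor = connection.cursor()
        for query in queries:
            cursor.execute(query)
        cursor.close()
    finally:
        connection.close()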
Example #14
    def test_run(self):
        """stream_expected_data[self.VALUES]
        Verify that a full sync can send capture all data and send it in the correct format
        for integer and boolean (bit) data.
        Verify that the fist sync sends an activate immediately.
        Verify that the table version is incremented up
        """
        print("running test {}".format(self.name()))

        conn_id = self.create_connection()

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # get the catalog information from discovery
        found_catalogs = menagerie.get_catalogs(conn_id)
        additional_md = [{"breadcrumb": [], "metadata": {'replication-method': 'INCREMENTAL',
                                                         'replication-key': 'replication_key_column'}}]

        non_selected_properties = []

        BaseTapTest.select_all_streams_and_fields(conn_id, found_catalogs, additional_md=additional_md)

        # clear state
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify record counts of streams
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id())
        expected_count = {k: len(v['values']) for k, v in self.expected_metadata().items()}
        self.assertEqual(record_count_by_stream, expected_count)

        # verify records match on the first sync
        records_by_stream = runner.get_records_from_target_output()

        table_version = dict()
        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                table_version[stream] = records_by_stream[stream]['table_version']

                # verify on the first sync you get
                # activate version message before and after all data for the full table
                # and before the logical replication part
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-1]['action'],
                    'activate_version')
                self.assertTrue(
                    all([m["action"] == "upsert" for m in records_by_stream[stream]['messages'][1:-1]]),
                    msg="Expect all but the first message to be upserts")
                self.assertEqual(len(records_by_stream[stream]['messages'][1:-1]),
                                 len(stream_expected_data[self.VALUES]),
                                 msg="incorrect number of upserts")

                column_names = [
                    list(field_data.keys())[0] for field_data in stream_expected_data[self.FIELDS]
                ]
                replication_column = column_names.index("replication_key_column")
                expected_messages = [
                    {
                        "action": "upsert", "data":
                        {
                            column: value for column, value
                            in list(zip(column_names, row_values))
                            if column not in non_selected_properties
                        }
                    } for row_values in sorted(stream_expected_data[self.VALUES],
                                               key=lambda row: (row[replication_column] is not None, row[replication_column]))
                ]

                # Verify all data is correct for incremental
                for expected_row, actual_row in list(
                        zip(expected_messages, records_by_stream[stream]['messages'][1:-1])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")
                        self.assertEqual(len(expected_row["data"].keys()), len(actual_row["data"].keys()),
                                         msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row["data"].items():
                            self.assertEqual(expected_value, actual_row["data"][column_name],
                                             msg="expected: {} != actual {}".format(
                                                 expected_row, actual_row))
                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(state.get('currently_syncing'), msg="expected state's currently_syncing to be None")
                self.assertIsNone(bookmark.get('current_log_version'), msg="no log_version for incremental")
                self.assertIsNone(bookmark.get('initial_full_table_complete'), msg="no full table for incremental")
                # find the max value of the replication key
                self.assertEqual(bookmark['replication_key_value'],
                                 max([row[replication_column] for row in stream_expected_data[self.VALUES]
                                      if row[replication_column] is not None]))
                # self.assertEqual(bookmark['replication_key'], 'replication_key_column')

                self.assertEqual(bookmark['version'], table_version[stream],
                                 msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(expected_schemas,
                                                                         records_by_stream[stream]['schema']))

        # ----------------------------------------------------------------------
        # invoke the sync job AGAIN after inserting, updating, and deleting rows
        # ----------------------------------------------------------------------

        database_name = "constraints_database"
        schema_name = "dbo"
        table_name = "no_constraints"
        column_name = ["replication_key_column"]
        insert_value = [(49, )]
        update_value = [(3, )]
        delete_value = [(0, )]
        query_list = (insert(database_name, schema_name, table_name, insert_value))
        query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name))
        query_list.extend([
            "UPDATE constraints_database.dbo.no_constraints "
            "SET replication_key_column = 3 "
            "WHERE replication_key_column = 1"])
        mssql_cursor_context_manager(*query_list)
        self.EXPECTED_METADATA["constraints_database_dbo_no_constraints"]["values"] = \
            [(2, )] + insert_value + update_value
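        # surviving rows: the untouched (2,), the new insert (49,), and the
        # updated row, now (3,); the deleted (0,) and pre-update (1,) drop out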

        database_name = "constraints_database"
        schema_name = "dbo"
        table_name = "multiple_column_pk"
        column_name = ["first_name", "last_name", "replication_key_column"]
        insert_value = [("Brian", "Lampkin", 72)]
        update_value = [("Sergey", "Brin", 65)]
        delete_value = [("Larry", "Page")]
        query_list = (insert(database_name, schema_name, table_name, insert_value))
        query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:2]))
        query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name))
        mssql_cursor_context_manager(*query_list)
        self.EXPECTED_METADATA["constraints_database_dbo_multiple_column_pk"]["values"] = \
            [("Tim", "Berners-Lee", 64)] + insert_value + update_value

        # duplicative of other testing
        # table_name = "single_column_pk"
        # column_name = ["pk", "replication_key_column"]
        # insert_value = [(3, 49)]
        # update_value = [(1, 65)]
        # delete_value = [(0,)]
        # query_list = (insert(database_name, schema_name, table_name, insert_value))
        # query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1]))
        # query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name))
        # mssql_cursor_context_manager(*query_list)
        # insert_value = [insert_value[0] + (None,)]
        # update_value = [update_value[0] + (None,)]
        # delete_value = [delete_value[0] + (None, datetime.utcnow())]
        # self.EXPECTED_METADATA["constraints_database_dbo_single_column_pk"]["values"] = \
        #     insert_value + delete_value + update_value

        table_name = "pk_with_fk"
        column_name = ["pk", "replication_key_column"]
        insert_value = [(5, 2), (6, None)]
        delete_value = [(1,), (2,)]
        query_list = (insert(database_name, schema_name, table_name, insert_value))
        query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1]))
        mssql_cursor_context_manager(*query_list)
        self.EXPECTED_METADATA["constraints_database_dbo_pk_with_fk"]["values"] = \
            [(0, 1), (3, 1)] + insert_value[:-1]

        table_name = "pk_with_unique_not_null"
        column_name = ["pk", "replication_key_column"]
        insert_value = [(3, 49)]
        update_value = [(1, 65)]
        delete_value = [(0,)]
        query_list = (insert(database_name, schema_name, table_name, insert_value))
        query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1]))
        query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name))
        mssql_cursor_context_manager(*query_list)
        self.EXPECTED_METADATA["constraints_database_dbo_pk_with_unique_not_null"]["values"] = \
            [(2, 5)] + insert_value + update_value

        # update expected data for the VIEW_WITH_JOIN view
        self.EXPECTED_METADATA["constraints_database_dbo_view_with_join"]["values"] = \
            [(None, None, 4), (2, 5, 5), (None, None, 6)]

        table_name = "default_column"
        column_name = ["pk", "replication_key_column"]
        insert_value = [(3, 49), (4, None), (5, )]
        update_value = [(1, 65)]
        query_list = (insert(database_name, schema_name, table_name, insert_value[:2]))
        query_list.extend(insert(database_name, schema_name, table_name, insert_value[-1:], column_names=column_name[:1]))
        query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name))
        mssql_cursor_context_manager(*query_list)
        self.EXPECTED_METADATA["constraints_database_dbo_default_column"]["values"] = [
                (0, -1)] + [(3, 49), (5, -1)] + update_value

        table_name = "check_constraint"
        column_name = ["pk", "replication_key_column"]
        insert_value = [(3, 49)]
        update_value = [(1, 65)]
        query_list = (insert(database_name, schema_name, table_name, insert_value))
        query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name))
        mssql_cursor_context_manager(*query_list)
        self.EXPECTED_METADATA["constraints_database_dbo_check_constraint"]["values"] = \
            [(0, 37)] + insert_value + update_value

        table_name = "even_identity"
        column_name = ["pk", "replication_key_column"]
        insert_value = [(3,)]
        update_value = [(2,)]
        delete_value = [(1,)]
        query_list = (insert(database_name, schema_name, table_name, insert_value, column_names=column_name[:1]))
        query_list.extend(delete_by_pk(database_name, schema_name, table_name, delete_value, column_name[:1]))
        query_list.extend(update_by_pk(database_name, schema_name, table_name, update_value, column_name))
        mssql_cursor_context_manager(*query_list)
        insert_value = [insert_value[0] + (6, )]
        update_value = [update_value[0] + (4, )]
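        # replication_key_column is server-generated here; the appended 6 and 4
        # are the identity values the table assigns (it appears to step by 2)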
        self.EXPECTED_METADATA["constraints_database_dbo_even_identity"]["values"] = \
            insert_value + update_value

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(), self.expected_primary_keys_by_stream_id())
        expected_count = {k: len(v['values']) for k, v in self.expected_metadata().items()}
        self.assertEqual(record_count_by_stream, expected_count)
        records_by_stream = runner.get_records_from_target_output()

        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                new_table_version = records_by_stream[stream]['table_version']

                # verify on a subsequent sync you get activate version message only after all data
                self.assertEqual(records_by_stream[stream]['messages'][0]['action'], 'activate_version')
                self.assertEqual(records_by_stream[stream]['messages'][-1]['action'], 'activate_version')
                self.assertTrue(all(
                    [message["action"] == "upsert" for message in records_by_stream[stream]['messages'][1:-1]]
                ))
                self.assertEqual(len(records_by_stream[stream]['messages'][1:-1]),
                                 len(stream_expected_data[self.VALUES]),
                                 msg="incorrect number of upserts")

                column_names = [
                    list(field_data.keys())[0] for field_data in stream_expected_data[self.FIELDS]
                ]
                replication_column = column_names.index("replication_key_column")
                expected_messages = [
                    {
                        "action": "upsert", "data":
                        {
                            column: value for column, value
                            in list(zip(column_names, row_values))
                            if column not in non_selected_properties
                        }
                    } for row_values in sorted(stream_expected_data[self.VALUES],
                                               key=lambda row: (row[replication_column] is not None, row[replication_column]))
                ]

                # remove sequences from actual values for comparison
                [message.pop("sequence") for message
                 in records_by_stream[stream]['messages'][1:-1]]

                # Verify all data is correct
                for expected_row, actual_row in list(
                        zip(expected_messages, records_by_stream[stream]['messages'][1:-1])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")

                        # we only send the _sdc_deleted_at column for deleted rows
                        self.assertEqual(len(expected_row["data"].keys()), len(actual_row["data"].keys()),
                                         msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row["data"].items():
                            self.assertEqual(expected_value, actual_row["data"][column_name],
                                             msg="expected: {} != actual {}".format(
                                                 expected_row, actual_row))
                        print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(state.get('currently_syncing'), msg="expected state's currently_syncing to be None")
                self.assertIsNone(bookmark.get('current_log_version'), msg="no log_version for incremental")
                self.assertIsNone(bookmark.get('initial_full_table_complete'), msg="no full table for incremental")
                # find the max value of the replication key
                self.assertEqual(bookmark['replication_key_value'],
                                 max([row[replication_column] for row in stream_expected_data[self.VALUES]
                                      if row[replication_column] is not None]))
                # self.assertEqual(bookmark['replication_key'], 'replication_key_column')

                self.assertEqual(bookmark['version'], table_version[stream],
                                 msg="expected bookmark for stream to match version")
                self.assertEqual(bookmark['version'], new_table_version,
                                 msg="expected bookmark for stream to match version")

                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(expected_schemas,
                                                                         records_by_stream[stream]['schema']))
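The incremental examples sort their expected rows with a two-part key,
(row[replication_column] is not None, row[replication_column]), so rows with a
NULL replication key order first and Python never compares None against a real
value. A standalone illustration with made-up rows (not taken from the tests):

# hypothetical rows of the form (pk, replication_key_column)
rows = [(1, 10), (2, None), (3, 7)]
ordered = sorted(rows, key=lambda row: (row[1] is not None, row[1]))
print(ordered)  # [(2, None), (3, 7), (1, 10)] -- NULL key first, then ascending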
Example #15
    def test_run(self):
        """stream_expected_data[self.VALUES]
        Verify that a full sync can send capture all data and send it in the correct format
        for integer and boolean (bit) data.
        Verify that the fist sync sends an activate immediately.
        Verify that the table version is incremented up
        """
        print("running test {}".format(self.name()))

        conn_id = self.create_connection()

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # get the catalog information from discovery
        found_catalogs = menagerie.get_catalogs(conn_id)
        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'INCREMENTAL',
                'replication-key': 'replication_key_column'
            }
        }]

        BaseTapTest.select_all_streams_and_fields(conn_id,
                                                  found_catalogs,
                                                  additional_md=additional_md)

        # clear state
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify record counts of streams
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)

        # verify records match on the first sync
        records_by_stream = runner.get_records_from_target_output()

        non_selected_properties = []

        table_version = dict()
        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                table_version[stream] = records_by_stream[stream][
                    'table_version']

                # verify on the first sync you get
                # activate version message before and after all data for the full table
                # and before the logical replication part
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-1]['action'],
                    'activate_version')
                self.assertTrue(
                    all([
                        m["action"] == "upsert"
                        for m in records_by_stream[stream]['messages'][1:-1]
                    ]),
                    msg="Expect all but the first message to be upserts")
                self.assertEqual(len(
                    records_by_stream[stream]['messages'][1:-1]),
                                 len(stream_expected_data[self.VALUES]),
                                 msg="incorrect number of upserts")

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(zip(column_names,
                                                      row_values))
                        if column not in non_selected_properties
                    }
                } for row_values in sorted(stream_expected_data[self.VALUES],
                                           key=lambda row:
                                           (row[1] is not None, row[1]))]

                # Verify all data is correct for incremental
                for expected_row, actual_row in list(
                        zip(expected_messages,
                            records_by_stream[stream]['messages'][1:-1])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")
                        self.assertEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row[
                                "data"].items():
                            if isinstance(expected_value, datetime):
                                # sql server only keeps milliseconds not microseconds
                                self.assertEqual(
                                    expected_value.isoformat().replace(
                                        '000+00:00',
                                        'Z').replace('+00:00', 'Z'),
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value.isoformat().replace(
                                            '000+00:00',
                                            'Z').replace('+00:00', 'Z'),
                                        actual_row["data"][column_name]))
                            elif isinstance(expected_value, time):
                                # sql server time has second resolution only
                                self.assertEqual(
                                    expected_value.replace(
                                        microsecond=0).isoformat().replace(
                                            '+00:00', ''),
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value.isoformat().replace(
                                            '+00:00', 'Z'),
                                        actual_row["data"][column_name]))
                            elif isinstance(expected_value, date):
                                # sql server dates are emitted as midnight UTC datetimes
                                self.assertEqual(
                                    expected_value.isoformat() +
                                    'T00:00:00+00:00',
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value.isoformat() +
                                        'T00:00:00+00:00',
                                        actual_row["data"][column_name]))
                            else:
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value,
                                        actual_row["data"][column_name]))
                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNone(bookmark.get('current_log_version'),
                                  msg="no log_version for incremental")
                self.assertIsNone(bookmark.get('initial_full_table_complete'),
                                  msg="no full table for incremental")
                # find the max value of the replication key
                expected_bookmark = max([
                    row[1] for row in stream_expected_data[self.VALUES]
                    if row[1] is not None
                ])
                self.assertEqual(bookmark['replication_key_value'],
                                 expected_bookmark.isoformat())
                # self.assertEqual(bookmark['replication_key'], 'replication_key_value')

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(
                                     expected_schemas,
                                     records_by_stream[stream]['schema']))

        # ----------------------------------------------------------------------
        # invoke the sync job AGAIN after inserting, updating, and deleting rows
        # ----------------------------------------------------------------------

        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "dates_and_times"
        column_name = [
            "pk", "replication_key_column", "date_and_time",
            "bigger_range_and_precision_datetime", "datetime_with_timezones",
            "datetime_no_seconds", "its_time"
        ]
        insert_value = [
            (5, date(9999, 12, 30),
             datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc),
             datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc),
             datetime(9999,
                      12,
                      31,
                      10,
                      14,
                      tzinfo=timezone(timedelta(hours=14))).isoformat(),
             datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
             time(23, 59, 59, tzinfo=timezone.utc)),
            (6, date(2018, 12, 29),
             datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc),
             datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc),
             datetime(9999,
                      12,
                      31,
                      10,
                      14,
                      tzinfo=timezone(timedelta(hours=14))).isoformat(),
             datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
             time(23, 59, 59, tzinfo=timezone.utc))
        ]
        update_value = [
            (3, date(9999, 12, 31),
             datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc),
             datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc),
             datetime(9999,
                      12,
                      31,
                      10,
                      14,
                      tzinfo=timezone(timedelta(hours=10))).isoformat(),
             datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
             time(23, 59, 59, tzinfo=timezone.utc)),
            (4, date(2018, 12, 30),
             datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc),
             datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc),
             datetime(9999,
                      12,
                      31,
                      10,
                      14,
                      tzinfo=timezone(timedelta(hours=6))).isoformat(),
             datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
             time(23, 59, 59, tzinfo=timezone.utc))
        ]
        delete_value = [(2, )]
        query_list = (insert(database_name, schema_name, table_name,
                             insert_value))
        query_list.extend(
            delete_by_pk(database_name, schema_name, table_name, delete_value,
                         column_name[:1]))
        query_list.extend(
            update_by_pk(database_name, schema_name, table_name, update_value,
                         column_name))
        mssql_cursor_context_manager(*query_list)

        insert_value = [
            (5, date(9999, 12, 30),
             datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc),
             datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc),
             datetime(9999,
                      12,
                      31,
                      10,
                      14,
                      tzinfo=timezone(timedelta(hours=14))).astimezone(
                          timezone.utc),
             datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
             time(23, 59, 59, tzinfo=timezone.utc)),
            (6, date(2018, 12, 29),
             datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc),
             datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc),
             datetime(9999,
                      12,
                      31,
                      10,
                      14,
                      tzinfo=timezone(timedelta(hours=14))).astimezone(
                          timezone.utc),
             datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
             time(23, 59, 59, tzinfo=timezone.utc))
        ]
        update_value = [
            (3, date(9999, 12, 31),
             datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc),
             datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc),
             datetime(9999,
                      12,
                      31,
                      10,
                      14,
                      tzinfo=timezone(timedelta(hours=10))).astimezone(
                          timezone.utc),
             datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
             time(23, 59, 59, tzinfo=timezone.utc)),
            (4, date(2018, 12, 30),
             datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc),
             datetime(9999, 12, 31, 23, 59, 59, 999000, tzinfo=timezone.utc),
             datetime(9999,
                      12,
                      31,
                      10,
                      14,
                      tzinfo=timezone(timedelta(hours=6))).astimezone(
                          timezone.utc),
             datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
             time(23, 59, 59, tzinfo=timezone.utc))
        ]

        insert_value = insert_value[:-1]  # only repl_key >= gets included
        update_value = update_value[:-1]
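        # the dropped rows (pks 4 and 6) carry 2018 dates, below the bookmark
        # saved by the first sync, so the incremental sync will not re-emit them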
        self.EXPECTED_METADATA["data_types_database_dbo_dates_and_times"][
            "values"] = [(
                1, date(9999, 12, 29),
                datetime(9999, 12, 31, 23, 59, 59, 997000,
                         tzinfo=timezone.utc),
                datetime(9999, 12, 31, 23, 59, 59, 999000,
                         tzinfo=timezone.utc),
                datetime(
                    9999, 12, 31, 10, 14, tzinfo=timezone(
                        timedelta(hours=14))).astimezone(timezone.utc),
                datetime(2079, 6, 6, 23, 59, tzinfo=timezone.utc),
                time(23, 59, 59, tzinfo=timezone.utc))
                         ] + update_value + insert_value

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)
        records_by_stream = runner.get_records_from_target_output()

        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                new_table_version = records_by_stream[stream]['table_version']

                # verify on a subsequent sync you get activate version message only after all data
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-1]['action'],
                    'activate_version')
                self.assertTrue(
                    all([
                        message["action"] == "upsert" for message in
                        records_by_stream[stream]['messages'][1:-1]
                    ]))
                self.assertEqual(len(
                    records_by_stream[stream]['messages'][1:-1]),
                                 len(stream_expected_data[self.VALUES]),
                                 msg="incorrect number of upserts")

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(zip(column_names,
                                                      row_values))
                        if column not in non_selected_properties
                    }
                } for row_values in sorted(stream_expected_data[self.VALUES],
                                           key=lambda row:
                                           (row[1] is not None, row[1]))]

                # remove sequences from actual values for comparison
                for message in records_by_stream[stream]['messages'][1:-1]:
                    message.pop("sequence")

                # Verify all data is correct
                for expected_row, actual_row in list(
                        zip(expected_messages,
                            records_by_stream[stream]['messages'][1:-1])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")

                        # we only send the _sdc_deleted_at column for deleted rows
                        self.assertEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row[
                                "data"].items():
                            if isinstance(expected_value, datetime):
                                # sql server only keeps milliseconds not microseconds
                                self.assertEqual(
                                    expected_value.isoformat().replace(
                                        '000+00:00',
                                        'Z').replace('+00:00', 'Z'),
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value.isoformat().replace(
                                            '000+00:00',
                                            'Z').replace('+00:00', 'Z'),
                                        actual_row["data"][column_name]))
                            elif isinstance(expected_value, time):
                                # sql server time has second resolution only
                                self.assertEqual(
                                    expected_value.replace(
                                        microsecond=0).isoformat().replace(
                                            '+00:00', ''),
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value.isoformat().replace(
                                            '+00:00', 'Z'),
                                        actual_row["data"][column_name]))
                            elif isinstance(expected_value, date):
                                # sql server dates are emitted as midnight UTC datetimes
                                self.assertEqual(
                                    expected_value.isoformat() +
                                    'T00:00:00+00:00',
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value.isoformat() +
                                        'T00:00:00+00:00',
                                        actual_row["data"][column_name]))
                            else:
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value,
                                        actual_row["data"][column_name]))
                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNone(bookmark.get('current_log_version'),
                                  msg="no log_version for incremental")
                self.assertIsNone(bookmark.get('initial_full_table_complete'),
                                  msg="no full table for incremental")
                # find the max value of the replication key
                expected_bookmark = max([
                    row[1] for row in stream_expected_data[self.VALUES]
                    if row[1] is not None
                ])
                self.assertEqual(bookmark['replication_key_value'],
                                 expected_bookmark.isoformat())
                # self.assertEqual(bookmark['replication_key'], 'replication_key_value')

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")
                self.assertEqual(
                    bookmark['version'],
                    new_table_version,
                    msg="expected bookmark for stream to match version")

                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(
                                     expected_schemas,
                                     records_by_stream[stream]['schema']))
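Example #15 above (and #16 below) compare datetime columns by rewriting
isoformat() output into the millisecond-precision, Z-suffixed strings the
target receives. A worked instance of that normalization chain, using a value
borrowed from the test data above:

from datetime import datetime, timezone

dt = datetime(9999, 12, 31, 23, 59, 59, 997000, tzinfo=timezone.utc)
# isoformat() yields '9999-12-31T23:59:59.997000+00:00'; trimming the trailing
# zero microseconds and rewriting the UTC offset gives the tap's wire format
normalized = dt.isoformat().replace('000+00:00', 'Z').replace('+00:00', 'Z')
print(normalized)  # 9999-12-31T23:59:59.997Z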
Example #16
    def test_run(self):
        """
        Verify that a full sync can capture all data and send it in the correct format
        for date, datetime, and time data.
        Verify that the first sync sends an activate version message immediately.
        Verify that the table version is incremented.
        """

        print("running test {}".format(self.name()))

        conn_id = self.create_connection()

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # get the catalog information from discovery
        found_catalogs = menagerie.get_catalogs(conn_id)
        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'LOG_BASED'
            }
        }]
        BaseTapTest.select_all_streams_and_fields(conn_id,
                                                  found_catalogs,
                                                  additional_md=additional_md)

        # clear state
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify record counts of streams
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        # self.assertEqual(record_count_by_stream, expected_count)

        # verify records match on the first sync
        records_by_stream = runner.get_records_from_target_output()

        table_version = dict()
        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                table_version[stream] = records_by_stream[stream][
                    'table_version']

                # verify on the first sync you get
                # activate version message before and after all data for the full table
                # and before the logical replication part
                last_row_data = bool(
                    records_by_stream[stream]['messages'][-1].get("data"))

                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-2]['action'],
                    'activate_version')
                if last_row_data:
                    self.assertEqual(
                        records_by_stream[stream]['messages'][-3]['action'],
                        'activate_version')
                else:
                    self.assertEqual(
                        records_by_stream[stream]['messages'][-1]['action'],
                        'activate_version')
                self.assertEqual(
                    len([
                        m for m in records_by_stream[stream]['messages'][1:]
                        if m["action"] == "activate_version"
                    ]),
                    2,
                    msg="Expect 2 more activate version messages for end of "
                        "full table and beginning of log based")

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(
                            zip(column_names, stream_expected_data[self.VALUES]
                                [row]))
                    }
                } for row in range(len(stream_expected_data[self.VALUES]))]

                # Verify all data is correct for the full table part
                if last_row_data:
                    final_row = -3
                else:
                    final_row = -2

                for expected_row, actual_row in list(
                        zip(expected_messages, records_by_stream[stream]
                            ['messages'][1:final_row])):
                    with self.subTest(expected_row=expected_row):

                        self.assertEqual(actual_row["action"], "upsert")
                        self.assertEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row[
                                "data"].items():
                            if isinstance(expected_value, datetime):
                                # sql server only keeps milliseconds not microseconds
                                self.assertEqual(
                                    expected_value.isoformat().replace(
                                        '000+00:00',
                                        'Z').replace('+00:00', 'Z'),
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value.isoformat().replace(
                                            '000+00:00',
                                            'Z').replace('+00:00', 'Z'),
                                        actual_row["data"][column_name]))
                            elif isinstance(expected_value, time):
                                # sql server time has second resolution only
                                self.assertEqual(
                                    expected_value.replace(
                                        microsecond=0).isoformat().replace(
                                            '+00:00', ''),
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value.isoformat().replace(
                                            '+00:00', 'Z'),
                                        actual_row["data"][column_name]))
                            elif isinstance(expected_value, date):
                                # sql server dates are emitted as midnight UTC datetimes
                                self.assertEqual(
                                    expected_value.isoformat() +
                                    'T00:00:00+00:00',
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value.isoformat() +
                                        'T00:00:00+00:00',
                                        actual_row["data"][column_name]))
                            else:
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_value,
                                        actual_row["data"][column_name]))

                # Verify all data is correct for the log replication part if sent
                if last_row_data:
                    # compare the trailing log-based record itself, not the
                    # stale actual_row left over from the full-table loop above
                    actual_row = records_by_stream[stream]['messages'][-1]
                    for column_name, expected_value in expected_messages[-1][
                            "data"].items():
                        if isinstance(expected_value, datetime):
                            # sql server only keeps milliseconds not microseconds
                            self.assertEqual(
                                expected_value.isoformat().replace(
                                    '000+00:00', 'Z').replace('+00:00', 'Z'),
                                actual_row["data"][column_name],
                                msg="expected: {} != actual {}".format(
                                    expected_value.isoformat().replace(
                                        '000+00:00',
                                        'Z').replace('+00:00', 'Z'),
                                    actual_row["data"][column_name]))
                        elif isinstance(expected_value, time):
                            # sql server time has second resolution only
                            self.assertEqual(
                                expected_value.replace(
                                    microsecond=0).isoformat().replace(
                                        '+00:00', ''),
                                actual_row["data"][column_name],
                                msg="expected: {} != actual {}".format(
                                    expected_value.isoformat().replace(
                                        '+00:00', 'Z'),
                                    actual_row["data"][column_name]))
                        elif isinstance(expected_value, date):
                            # sql server dates are emitted as midnight UTC datetimes
                            self.assertEqual(
                                expected_value.isoformat() + 'T00:00:00+00:00',
                                actual_row["data"][column_name],
                                msg="expected: {} != actual {}".format(
                                    expected_value.isoformat() +
                                    'T00:00:00+00:00',
                                    actual_row["data"][column_name]))
                        else:
                            self.assertEqual(
                                expected_value,
                                actual_row["data"][column_name],
                                msg="expected: {} != actual {}".format(
                                    expected_value,
                                    actual_row["data"][column_name]))

                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNotNone(
                    bookmark.get('current_log_version'),
                    msg=
                    "expected bookmark to have current_log_version because we are using log replication"
                )
                self.assertTrue(bookmark['initial_full_table_complete'],
                                msg="expected full table to be complete")
                initial_log_version = bookmark['current_log_version']

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(
                                     expected_schemas,
                                     records_by_stream[stream]['schema']))

        # ----------------------------------------------------------------------
        # invoke the sync job AGAIN after inserting, updating, and deleting rows
        # ----------------------------------------------------------------------

        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "dates_and_times"
        column_name = [
            "pk", "just_a_date", "date_and_time",
            "bigger_range_and_precision_datetime", "datetime_with_timezones",
            "datetime_no_seconds", "its_time"
        ]
        new_date_value = datetime(2019, 7, 22, 21, 11, 40, 573000)
        insert_value = [
            (6, new_date_value.date(), new_date_value,
             datetime(9085, 4, 30, 21, 52, 57, 492920, tzinfo=timezone.utc),
             datetime(5749,
                      4,
                      3,
                      1,
                      47,
                      47,
                      110809,
                      tzinfo=timezone(timedelta(hours=10,
                                                minutes=5))).isoformat(),
             datetime(2031, 4, 30, 19, 32, tzinfo=timezone.utc),
             time(21, 9, 56, 0, tzinfo=timezone.utc))
        ]
        update_value = [
            (2, new_date_value.date(), new_date_value,
             datetime(9085, 4, 30, 21, 52, 57, 492920, tzinfo=timezone.utc),
             datetime(5749,
                      4,
                      3,
                      1,
                      47,
                      47,
                      110809,
                      tzinfo=timezone(timedelta(hours=10,
                                                minutes=5))).isoformat(),
             datetime(2031, 4, 30, 19, 32, tzinfo=timezone.utc),
             time(21, 9, 56, 0, tzinfo=timezone.utc))
        ]
        delete_value = [(3, )]
        query_list = (insert(database_name, schema_name, table_name,
                             insert_value))
        query_list.extend(
            delete_by_pk(database_name, schema_name, table_name, delete_value,
                         column_name[:1]))
        query_list.extend(
            update_by_pk(database_name, schema_name, table_name, update_value,
                         column_name))
        mssql_cursor_context_manager(*query_list)
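        # redefine the expected rows with the timezone-aware values normalized
        # to UTC, matching how the target emits them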
        insert_value = [
            (6, new_date_value.date(), new_date_value,
             datetime(9085, 4, 30, 21, 52, 57, 492920, tzinfo=timezone.utc),
             datetime(5749,
                      4,
                      3,
                      1,
                      47,
                      47,
                      110809,
                      tzinfo=timezone(timedelta(
                          hours=10, minutes=5))).astimezone(timezone.utc),
             datetime(2031, 4, 30, 19, 32, tzinfo=timezone.utc),
             time(21, 9, 56, 0, tzinfo=timezone.utc))
        ]
        update_value = [
            (2, new_date_value.date(), new_date_value,
             datetime(9085, 4, 30, 21, 52, 57, 492920, tzinfo=timezone.utc),
             datetime(5749,
                      4,
                      3,
                      1,
                      47,
                      47,
                      110809,
                      tzinfo=timezone(timedelta(
                          hours=10, minutes=5))).astimezone(timezone.utc),
             datetime(2031, 4, 30, 19, 32, tzinfo=timezone.utc),
             time(21, 9, 56, 0, tzinfo=timezone.utc))
        ]
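        # pad the non-deleted rows with a None _sdc_deleted_at column; the
        # deleted row carries only its pk and an approximate deletion timestamp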
        insert_value = [insert_value[0] + (None, )]
        update_value = [update_value[0] + (None, )]
        delete_value = [(3, None, None, None, None, None, None,
                         datetime.utcnow())]
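        # the log-based sync should re-emit the last previously-synced row plus
        # the insert, delete, and update performed above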
        self.EXPECTED_METADATA["data_types_database_dbo_dates_and_times"]["values"] =  \
            [self.expected_metadata()["data_types_database_dbo_dates_and_times"]["values"][-1]] + \
            insert_value + delete_value + update_value
        self.EXPECTED_METADATA["data_types_database_dbo_dates_and_times"][
            "fields"].append({
                "_sdc_deleted_at": {
                    'sql-datatype': 'datetime',
                    'selected-by-default': True,
                    'inclusion': 'automatic'
                }
            })

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)
        records_by_stream = runner.get_records_from_target_output()

        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                new_table_version = records_by_stream[stream]['table_version']

                # verify on a subsequent sync the activate version message comes first, followed only by upserts
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertTrue(
                    all([
                        message["action"] == "upsert" for message in
                        records_by_stream[stream]['messages'][1:]
                    ]))

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(
                            zip(column_names, stream_expected_data[self.VALUES]
                                [row]))
                    }
                } for row in range(len(stream_expected_data[self.VALUES]))]

                # remove sequences from actual values for comparison
                for message in records_by_stream[stream]['messages'][1:]:
                    message.pop("sequence")

                # Verify all data is correct
                for expected_row, actual_row in list(
                        zip(expected_messages,
                            records_by_stream[stream]['messages'][1:])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")

                        # we only send the _sdc_deleted_at column for deleted rows
                        self.assertGreaterEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")

                        for column_name, expected_value in expected_row[
                                "data"].items():
                            if column_name != "_sdc_deleted_at":
                                if isinstance(expected_value, datetime):
                                    # sql server only keeps milliseconds not microseconds
                                    self.assertEqual(
                                        expected_value.isoformat().replace(
                                            '000+00:00', 'Z').replace(
                                                '+00:00',
                                                'Z').replace('000', 'Z'),
                                        actual_row["data"][column_name],
                                        msg="expected: {} != actual {}".format(
                                            expected_value.isoformat().replace(
                                                '000+00:00', 'Z').replace(
                                                    '+00:00',
                                                    'Z').replace('000', 'Z'),
                                            actual_row["data"][column_name]))
                                elif isinstance(expected_value, time):
                                    # sql server time has second resolution only
                                    self.assertEqual(
                                        expected_value.replace(
                                            microsecond=0).isoformat().replace(
                                                '+00:00', ''),
                                        actual_row["data"][column_name],
                                        msg="expected: {} != actual {}".format(
                                            expected_value.replace(
                                                microsecond=0).isoformat().replace(
                                                    '+00:00', ''),
                                            actual_row["data"][column_name]))
                                elif isinstance(expected_value, date):
                                    # sql server dates are emitted as datetimes at midnight UTC
                                    self.assertEqual(
                                        expected_value.isoformat() +
                                        'T00:00:00+00:00',
                                        actual_row["data"][column_name],
                                        msg="expected: {} != actual {}".format(
                                            expected_value.isoformat() +
                                            'T00:00:00+00:00',
                                            actual_row["data"][column_name]))
                                else:
                                    self.assertEqual(
                                        expected_value,
                                        actual_row["data"][column_name],
                                        msg="expected: {} != actual {}".format(
                                            expected_value,
                                            actual_row["data"][column_name]))

                            elif expected_value:
                                # we have an expected value for a deleted row
                                try:
                                    actual_value = datetime.strptime(
                                        actual_row["data"][column_name],
                                        "%Y-%m-%dT%H:%M:%S.%fZ")
                                except ValueError:
                                    actual_value = datetime.strptime(
                                        actual_row["data"][column_name],
                                        "%Y-%m-%dT%H:%M:%SZ")
                                self.assertGreaterEqual(
                                    actual_value,
                                    expected_value - timedelta(seconds=15))
                                self.assertLessEqual(
                                    actual_value,
                                    expected_value + timedelta(seconds=15))
                            else:
                                # the row wasn't deleted, so the column is either omitted or None
                                self.assertIsNone(
                                    actual_row["data"].get(column_name))

                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNotNone(
                    bookmark.get('current_log_version'),
                    msg=
                    "expected bookmark to have current_log_version because we are using log replication"
                )
                self.assertTrue(bookmark['initial_full_table_complete'],
                                msg="expected full table to be complete")
                new_log_version = bookmark['current_log_version']
                self.assertGreater(new_log_version,
                                   initial_log_version,
                                   msg='expected log version to increase')

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")
                self.assertEqual(
                    bookmark['version'],
                    new_table_version,
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(
                                     expected_schemas,
                                     records_by_stream[stream]['schema']))
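        # For reference, a minimal sketch (hypothetical values) of the state
        # shape asserted above for a LOG_BASED stream:
        #
        # state = {
        #     'currently_syncing': None,
        #     'bookmarks': {
        #         'data_types_database_dbo_dates_and_times': {
        #             'current_log_version': 2,          # grows as changes are captured
        #             'initial_full_table_complete': True,
        #             'version': 1571234567890,          # matches the stream's table_version
        #         }
        #     }
        # }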
Example #17
    def setUpClass(cls) -> None:
        """Create the expected schema in the test database"""

        database_name = "data_types_database"
        schema_name = "dbo"

        cls.EXPECTED_METADATA = {
            'data_types_database_dbo_integers': {
                'is-view':
                False,
                'schema-name':
                schema_name,
                'row-count':
                0,
                'values': [(0, -9223372036854775808, -2147483648, -32768),
                           (1, 0, 0, 0),
                           (2, 9223372036854775807, 2147483647, 32767),
                           (3, None, None, None),
                           (4, 5603121835631323156, 9665315, 11742),
                           (5, -4898597031243117659, 140946744, -16490),
                           (6, -5168593529138936444, -1746890910, 2150),
                           (7, 1331162887494168851, 1048867088, 12136),
                           (8, -4495110645908459596, -1971955745, 18257),
                           (9, -1575653240237191360, -533282078, 22022),
                           (10, 6203877631305833079, 271324086, -18782),
                           (11, 7293147954924079156, 1003163272, 3593),
                           (12, -1302715001442736465, -1626372079, 3788),
                           (13, -9062593720232233398, 1646478731, 17621)],
                'table-key-properties': {'pk'},
                'selected':
                None,
                'database-name':
                database_name,
                'stream_name':
                'integers',
                'fields': [{
                    'pk': {
                        'sql-datatype': 'int',
                        'selected-by-default': True,
                        'inclusion': 'automatic'
                    }
                }, {
                    'MyBigIntColumn': {
                        'sql-datatype': 'bigint',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }, {
                    'MyIntColumn': {
                        'sql-datatype': 'int',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }, {
                    'MySmallIntColumn': {
                        'sql-datatype': 'smallint',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }],
                'schema': {
                    'type': 'object',
                    'properties': {
                        'MySmallIntColumn': {
                            'type': ['integer', 'null'],
                            'minimum': -32768,
                            'maximum': 32767,
                            'inclusion': 'available',
                            'selected': True
                        },
                        'pk': {
                            'type': ['integer'],
                            'minimum': -2147483648,
                            'maximum': 2147483647,
                            'inclusion': 'automatic',
                            'selected': True
                        },
                        'MyBigIntColumn': {
                            'type': ['integer', 'null'],
                            'minimum': -9223372036854775808,
                            'maximum': 9223372036854775807,
                            'inclusion': 'available',
                            'selected': True
                        },
                        'MyIntColumn': {
                            'type': ['integer', 'null'],
                            'minimum': -2147483648,
                            'maximum': 2147483647,
                            'inclusion': 'available',
                            'selected': True
                        }
                    },
                    'selected': True
                }
            },
            'data_types_database_dbo_tiny_integers_and_bools': {
                'is-view':
                False,
                'schema-name':
                schema_name,
                'row-count':
                0,
                'values': [(0, 0, False), (1, 255, True), (2, None, None),
                           (3, 230, False), (4, 6, True), (5, 236, True),
                           (6, 27, True), (7, 132, True), (8, 251, False),
                           (9, 187, True), (10, 157, True), (11, 51, True),
                           (12, 144, True)],
                'table-key-properties': {'pk'},
                'selected':
                None,
                'database-name':
                database_name,
                'stream_name':
                'tiny_integers_and_bools',
                'fields': [{
                    'pk': {
                        'sql-datatype': 'int',
                        'selected-by-default': True,
                        'inclusion': 'automatic'
                    }
                }, {
                    'MyTinyIntColumn': {
                        'sql-datatype': 'tinyint',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }, {
                    'my_boolean': {
                        'sql-datatype': 'bit',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }],
                'schema': {
                    'type': 'object',
                    'properties': {
                        'MyTinyIntColumn': {
                            'type': ['integer', 'null'],
                            'minimum': 0,
                            'maximum': 255,
                            'inclusion': 'available',
                            'selected': True
                        },
                        'pk': {
                            'type': ['integer'],
                            'minimum': -2147483648,
                            'maximum': 2147483647,
                            'inclusion': 'automatic',
                            'selected': True
                        },
                        'my_boolean': {
                            'type': ['boolean', 'null'],
                            'inclusion': 'available',
                            'selected': True
                        }
                    },
                    'selected': True
                }
            }
        }

        drop_all_user_databases()

        query_list = list(
            create_database(database_name, "Latin1_General_CS_AS"))

        table_name = "integers"
        column_name = [
            "pk", "MyBigIntColumn", "MyIntColumn", "MySmallIntColumn"
        ]
        column_type = ["int", "bigint", "int", "smallint"]
        primary_key = {"pk"}
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))
        query_list.extend(
            insert(
                database_name, schema_name, table_name,
                cls.EXPECTED_METADATA["data_types_database_dbo_integers"]
                ["values"]))

        table_name = "tiny_integers_and_bools"
        column_name = ["pk", "MyTinyIntColumn", "my_boolean"]
        column_type = ["int", "tinyint", "bit"]
        primary_key = {"pk"}
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))
        query_list.extend(
            insert(
                database_name, schema_name, table_name, cls.EXPECTED_METADATA[
                    "data_types_database_dbo_tiny_integers_and_bools"]
                ["values"]))

        mssql_cursor_context_manager(*query_list)
        cls.expected_metadata = cls.discovery_expected_metadata
Example #18
    def setUpClass(cls) -> None:
        """Create the expected schema in the test database"""

        drop_all_user_databases()
        database_name = "constraints_database"
        schema_name = "dbo"
        query_list = list(
            create_database(database_name, "Latin1_General_CS_AS"))
        # query_list.extend(create_schema(database_name, schema_name))

        table_name = "no_constraints"
        cls.EXPECTED_METADATA = {
            '{}_{}_{}'.format(database_name, schema_name, table_name): {
                'is-view':
                False,
                'schema-name':
                schema_name,
                'row-count':
                0,
                'values': [(0, ), (1, )],
                'table-key-properties':
                set(),
                'selected':
                None,
                'database-name':
                database_name,
                'stream_name':
                table_name,
                'fields': [{
                    'column_name': {
                        'sql-datatype': 'int',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }],
                'schema': {
                    'type': 'object',
                    'properties': {
                        'column_name': {
                            'type': ['integer', 'null'],
                            'minimum': -2147483648,
                            'maximum': 2147483647,
                            'inclusion': 'available',
                            'selected': True
                        }
                    },
                    'selected': True
                }
            },
        }

        column_name = ["column_name"]
        column_type = ["int"]
        primary_key = set()
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))
        query_list.extend(
            insert(
                database_name, schema_name, table_name,
                cls.EXPECTED_METADATA['{}_{}_{}'.format(
                    database_name, schema_name, table_name)]["values"]))

        table_name = "multiple_column_pk"
        primary_key = ["first_name", "last_name"]
        cls.EXPECTED_METADATA['{}_{}_{}'.format(
            database_name, schema_name, table_name)] = {
                'is-view':
                False,
                'schema-name':
                schema_name,
                'row-count':
                0,
                'values': [("Tim", "Berners-Lee", 64), ("Sergey", "Brin", 45),
                           ("Larry", "Page", 46)],
                'table-key-properties':
                primary_key,
                'selected':
                None,
                'database-name':
                database_name,
                'stream_name':
                table_name,
                'fields': [{
                    'first_name': {
                        'sql-datatype': 'varchar',
                        'selected-by-default': True,
                        'inclusion': 'automatic'
                    }
                }, {
                    'last_name': {
                        'sql-datatype': 'varchar',
                        'selected-by-default': True,
                        'inclusion': 'automatic'
                    }
                }, {
                    'info': {
                        'sql-datatype': 'int',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }],
                'schema': {
                    'type': 'object',
                    'selected': True,
                    'properties': {
                        'info': {
                            'maximum': 2147483647,
                            'type': ['integer', 'null'],
                            'inclusion': 'available',
                            'selected': True,
                            'minimum': -2147483648
                        },
                        'first_name': {
                            'type': ['string'],
                            'maxLength': 256,
                            'inclusion': 'automatic',
                            'selected': True
                        },  # , 'minLength': 0},
                        'last_name': {
                            'type': ['string'],
                            'maxLength': 256,
                            'inclusion': 'automatic',
                            'selected': True
                        }
                    }
                }  # 'minLength': 0}}}
            }
        column_name = ["first_name", "last_name", "info"]
        column_type = ["varchar(256)", "varchar(256)", "int"]
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))
        query_list.extend(
            insert(
                database_name, schema_name, table_name,
                cls.EXPECTED_METADATA['{}_{}_{}'.format(
                    database_name, schema_name, table_name)]["values"]))

        table_name = "single_column_pk"
        primary_key = ["pk"]
        cls.EXPECTED_METADATA['{}_{}_{}'.format(
            database_name, schema_name, table_name)] = {
                'is-view':
                False,
                'schema-name':
                schema_name,
                'row-count':
                0,
                'values': [(0, 3), (1, 4)],
                'table-key-properties':
                primary_key,
                'selected':
                None,
                'database-name':
                database_name,
                'stream_name':
                table_name,
                'fields': [{
                    'pk': {
                        'sql-datatype': 'int',
                        'selected-by-default': True,
                        'inclusion': 'automatic'
                    }
                }, {
                    'data': {
                        'sql-datatype': 'int',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }],
                'schema': {
                    'type': 'object',
                    'selected': True,
                    'properties': {
                        'pk': {
                            'maximum': 2147483647,
                            'type': ['integer'],
                            'inclusion': 'automatic',
                            'selected': True,
                            'minimum': -2147483648
                        },
                        'data': {
                            'maximum': 2147483647,
                            'type': ['integer', 'null'],
                            'inclusion': 'available',
                            'selected': True,
                            'minimum': -2147483648
                        }
                    }
                }
            }
        column_name = ["pk", "data"]
        column_type = ["int", "int"]
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))
        query_list.extend(
            insert(
                database_name, schema_name, table_name,
                cls.EXPECTED_METADATA['{}_{}_{}'.format(
                    database_name, schema_name, table_name)]["values"]))

        table_name = "pk_with_unique_not_null"
        primary_key = ["pk"]
        cls.EXPECTED_METADATA['{}_{}_{}'.format(
            database_name, schema_name, table_name)] = {
                'is-view':
                False,
                'schema-name':
                schema_name,
                'row-count':
                0,
                'values': [(0, 3), (1, 4)],
                'table-key-properties':
                primary_key,
                'selected':
                None,
                'database-name':
                database_name,
                'stream_name':
                table_name,
                'fields': [{
                    'pk': {
                        'sql-datatype': 'int',
                        'selected-by-default': True,
                        'inclusion': 'automatic'
                    }
                }, {
                    'data': {
                        'sql-datatype': 'int',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }],
                'schema': {
                    'type': 'object',
                    'selected': True,
                    'properties': {
                        'pk': {
                            'maximum': 2147483647,
                            'type': ['integer'],
                            'inclusion': 'automatic',
                            'selected': True,
                            'minimum': -2147483648
                        },
                        'data': {
                            'maximum': 2147483647,
                            'type': ['integer', 'null'],
                            'inclusion': 'available',
                            'selected': True,
                            'minimum': -2147483648
                        }
                    }
                }
            }
        column_name = ["pk", "data"]
        column_type = ["int", "int NOT NULL UNIQUE"]
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))
        query_list.extend(
            insert(
                database_name, schema_name, table_name,
                cls.EXPECTED_METADATA['{}_{}_{}'.format(
                    database_name, schema_name, table_name)]["values"]))

        table_name = "pk_with_fk"
        primary_key = ["pk"]
        cls.EXPECTED_METADATA['{}_{}_{}'.format(
            database_name, schema_name, table_name)] = {
                'is-view':
                False,
                'schema-name':
                schema_name,
                'row-count':
                0,
                'values': [(0, 1), (1, 0), (2, 0), (3, 1), (4, None)],
                'table-key-properties':
                primary_key,
                'selected':
                None,
                'database-name':
                database_name,
                'stream_name':
                table_name,
                'fields': [{
                    'pk': {
                        'sql-datatype': 'int',
                        'selected-by-default': True,
                        'inclusion': 'automatic'
                    }
                }, {
                    'fk': {
                        'sql-datatype': 'int',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }],
                'schema': {
                    'type': 'object',
                    'selected': True,
                    'properties': {
                        'pk': {
                            'maximum': 2147483647,
                            'type': ['integer'],
                            'inclusion': 'automatic',
                            'selected': True,
                            'minimum': -2147483648
                        },
                        'fk': {
                            'maximum': 2147483647,
                            'type': ['integer', 'null'],
                            'inclusion': 'available',
                            'selected': True,
                            'minimum': -2147483648
                        }
                    }
                }
            }
        column_name = ["pk", "fk"]
        column_type = ["int", "int"]
        foreign_key = "fk"
        reference = "{}.pk_with_unique_not_null(pk)".format(schema_name)
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key,
                         foreign_key=foreign_key,
                         reference=reference))
        query_list.extend(
            insert(
                database_name, schema_name, table_name,
                cls.EXPECTED_METADATA['{}_{}_{}'.format(
                    database_name, schema_name, table_name)]["values"]))

        table_name = "view_with_join"
        primary_key = []
        cls.EXPECTED_METADATA['{}_{}_{}'.format(
            database_name, schema_name, table_name)] = {
                'is-view':
                True,
                'schema-name':
                schema_name,
                'row-count':
                0,
                'values': [(1, 4, 0), (0, 3, 1), (0, 3, 2), (1, 4, 3),
                           (None, None, 4)],
                'table-key-properties':
                primary_key,
                'selected':
                None,
                'database-name':
                database_name,
                'stream_name':
                table_name,
                'fields': [{
                    'column1': {
                        'sql-datatype': 'int',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }, {
                    'data': {
                        'sql-datatype': 'int',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }, {
                    'column2': {
                        'sql-datatype': 'int',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }],
                'schema': {
                    'type': 'object',
                    'selected': True,
                    'properties': {
                        'column1': {
                            'maximum': 2147483647,
                            'type': ['integer', 'null'],
                            'inclusion': 'available',
                            'selected': True,
                            'minimum': -2147483648
                        },
                        'data': {
                            'maximum': 2147483647,
                            'type': ['integer', 'null'],
                            'inclusion': 'available',
                            'selected': True,
                            'minimum': -2147483648
                        },
                        'column2': {
                            'maximum': 2147483647,
                            'type': ['integer', 'null'],
                            'inclusion': 'available',
                            'selected': True,
                            'minimum': -2147483648
                        }
                    }
                }
            }
        select = ("SELECT p.pk as column1, data, f.pk as column2 "
                  "FROM pk_with_unique_not_null p "
                  "RIGHT JOIN pk_with_fk f on p.pk = f.fk")
        query_list.extend(create_view(schema_name, table_name, select))

        table_name = "table_with_index"
        primary_key = []
        cls.EXPECTED_METADATA['{}_{}_{}'.format(
            database_name, schema_name, table_name)] = {
                'is-view':
                False,
                'schema-name':
                schema_name,
                'row-count':
                0,
                'values': [(0, 3), (1, 4)],
                'table-key-properties':
                primary_key,
                'selected':
                None,
                'database-name':
                database_name,
                'stream_name':
                table_name,
                'fields': [{
                    'not_pk': {
                        'sql-datatype': 'int',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }, {
                    'data': {
                        'sql-datatype': 'int',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }],
                'schema': {
                    'type': 'object',
                    'selected': True,
                    'properties': {
                        'not_pk': {
                            'maximum': 2147483647,
                            'type': ['integer', 'null'],
                            'inclusion': 'available',
                            'selected': True,
                            'minimum': -2147483648
                        },
                        'data': {
                            'maximum': 2147483647,
                            'type': ['integer', 'null'],
                            'inclusion': 'available',
                            'selected': True,
                            'minimum': -2147483648
                        }
                    }
                }
            }
        column_name = ["not_pk", "data"]
        column_type = ["int", "int NOT NULL INDEX myindex"]

        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))
        query_list.extend(
            insert(
                database_name, schema_name, table_name,
                cls.EXPECTED_METADATA['{}_{}_{}'.format(
                    database_name, schema_name, table_name)]["values"]))

        table_name = "default_column"
        primary_key = ["pk"]
        cls.EXPECTED_METADATA['{}_{}_{}'.format(
            database_name, schema_name, table_name)] = {
                'is-view':
                False,
                'schema-name':
                schema_name,
                'row-count':
                0,
                'values': [(0, ), (1, )],
                'table-key-properties':
                primary_key,
                'selected':
                None,
                'database-name':
                database_name,
                'stream_name':
                table_name,
                'fields': [{
                    'pk': {
                        'sql-datatype': 'int',
                        'selected-by-default': True,
                        'inclusion': 'automatic'
                    }
                }, {
                    'default_column': {
                        'sql-datatype': 'int',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }],
                'schema': {
                    'type': 'object',
                    'selected': True,
                    'properties': {
                        'pk': {
                            'maximum': 2147483647,
                            'type': ['integer'],
                            'inclusion': 'automatic',
                            'selected': True,
                            'minimum': -2147483648
                        },
                        'default_column': {
                            'maximum': 2147483647,
                            'type': ['integer', 'null'],
                            'inclusion': 'available',
                            'selected': True,
                            'minimum': -2147483648
                        }
                    }
                }
            }
        column_name = ["pk", "default_column"]
        column_type = ["int", "int DEFAULT -1"]
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))
        query_list.extend(
            insert(database_name,
                   schema_name,
                   table_name,
                   cls.EXPECTED_METADATA['{}_{}_{}'.format(
                       database_name, schema_name, table_name)]["values"],
                   column_names=["pk"]))
        cls.EXPECTED_METADATA['{}_{}_{}'.format(database_name, schema_name,
                                                table_name)]["values"] = [
                                                    (0, -1), (1, -1)
                                                ]

        table_name = "check_constraint"
        primary_key = ["pk"]
        cls.EXPECTED_METADATA['{}_{}_{}'.format(
            database_name, schema_name, table_name)] = {
                'is-view':
                False,
                'schema-name':
                schema_name,
                'row-count':
                0,
                'values': [(0, 120), (1, 34)],
                'table-key-properties':
                primary_key,
                'selected':
                None,
                'database-name':
                database_name,
                'stream_name':
                table_name,
                'fields': [{
                    'pk': {
                        'sql-datatype': 'int',
                        'selected-by-default': True,
                        'inclusion': 'automatic'
                    }
                }, {
                    'age': {
                        'sql-datatype': 'int',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }],
                'schema': {
                    'type': 'object',
                    'selected': True,
                    'properties': {
                        'pk': {
                            'maximum': 2147483647,
                            'type': ['integer'],
                            'inclusion': 'automatic',
                            'selected': True,
                            'minimum': -2147483648
                        },
                        'age': {
                            'maximum': 2147483647,
                            'type': ['integer', 'null'],
                            'inclusion': 'available',
                            'selected': True,
                            'minimum': -2147483648
                        }
                    }
                }
            }
        column_name = ["pk", "age"]
        column_type = ["int", "int CHECK (age <= 120)"]
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))
        query_list.extend(
            insert(
                database_name, schema_name, table_name,
                cls.EXPECTED_METADATA['{}_{}_{}'.format(
                    database_name, schema_name, table_name)]["values"]))

        table_name = "even_identity"
        primary_key = ["pk"]
        cls.EXPECTED_METADATA['{}_{}_{}'.format(
            database_name, schema_name, table_name)] = {
                'is-view':
                False,
                'schema-name':
                schema_name,
                'row-count':
                0,
                'values': [(1, ), (2, )],
                'table-key-properties':
                primary_key,
                'selected':
                None,
                'database-name':
                database_name,
                'stream_name':
                table_name,
                'fields': [{
                    'pk': {
                        'sql-datatype': 'int',
                        'selected-by-default': True,
                        'inclusion': 'automatic'
                    }
                }, {
                    'even_id': {
                        'sql-datatype': 'int',
                        'selected-by-default': True,
                        'inclusion': 'available'
                    }
                }],
                'schema': {
                    'type': 'object',
                    'selected': True,
                    'properties': {
                        'pk': {
                            'maximum': 2147483647,
                            'type': ['integer'],
                            'inclusion': 'automatic',
                            'selected': True,
                            'minimum': -2147483648
                        },
                        'even_id': {
                            'maximum': 2147483647,
                            'type': ['integer', 'null'],
                            'inclusion': 'available',
                            'selected': True,
                            'minimum': -2147483648
                        }
                    }
                }
            }
        column_name = ["pk", "even_id"]
        column_type = ["int", "int IDENTITY(2,2)"]
        column_def = [" ".join(x) for x in list(zip(column_name, column_type))]
        query_list.extend(
            create_table(database_name,
                         schema_name,
                         table_name,
                         column_def,
                         primary_key=primary_key))
        query_list.extend(
            insert(database_name,
                   schema_name,
                   table_name,
                   cls.EXPECTED_METADATA['{}_{}_{}'.format(
                       database_name, schema_name, table_name)]["values"],
                   column_names=["pk"]))
        cls.EXPECTED_METADATA['{}_{}_{}'.format(database_name, schema_name,
                                                table_name)]["values"] = [
                                                    (1, 2), (2, 4)
                                                ]
        mssql_cursor_context_manager(*query_list)

        cls.expected_metadata = cls.discovery_expected_metadata
Example #19
    def test_run(self):
        """stream_expected_data[self.VALUES]
        Verify that a full sync can send capture all data and send it in the correct format
        for integer and boolean (bit) data.
        Verify that the fist sync sends an activate immediately.
        Verify that the table version is incremented up
        """
        print("running test {}".format(self.name()))

        conn_id = self.create_connection()

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # get the catalog information of discovery
        found_catalogs = menagerie.get_catalogs(conn_id)
        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'INCREMENTAL',
                'replication-key': 'replication_key_column'
            }
        }]
        BaseTapTest.select_all_streams_and_fields(conn_id,
                                                  found_catalogs,
                                                  additional_md=additional_md)

        # clear state
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify record counts of streams
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)

        # verify records match on the first sync
        records_by_stream = runner.get_records_from_target_output()

        table_version = dict()
        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                table_version[stream] = records_by_stream[stream][
                    'table_version']

                # verify on the first sync you get an activate version
                # message before the data (this is incremental replication,
                # so there is no full-table or log-based part)
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')

                self.assertTrue(
                    all([
                        m["action"] == "upsert"
                        for m in records_by_stream[stream]['messages'][1:-1]
                    ]),
                    msg="Expect all but the first message to be upserts")
                self.assertEqual(
                    len(stream_expected_data[self.VALUES]),
                    len(records_by_stream[stream]['messages'][1:-1]),
                    msg="incorrect number of upserts")

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

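                # the test expects rows in replication-key order with NULL
                # keys first (SQL Server sorts NULLs first ascending), so
                # sort the expected rows to match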
                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(zip(column_names,
                                                      row_values))
                    }
                } for row_values in sorted(stream_expected_data[self.VALUES],
                                           key=lambda row:
                                           (row[1] is not None, row[1]))]

                # Verify all data is correct for incremental
                for expected_row, actual_row in list(
                        zip(expected_messages,
                            records_by_stream[stream]['messages'][1:])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")
                        self.assertEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")
                        for column_name, expected_value in expected_row[
                                "data"].items():
                            column_index = [
                                list(key.keys())[0] for key in
                                self.expected_metadata()[stream][self.FIELDS]
                            ].index(column_name)
                            if self.expected_metadata()[stream][self.FIELDS][column_index][column_name][self.DATATYPE] \
                                    in ("real", "float") \
                                    and actual_row["data"][column_name] is not None:
                                self.assertEqual(
                                    type(actual_row["data"][column_name]),
                                    Decimal,
                                    msg=
                                    "float value is not represented as a number"
                                )
                                self.assertEqual(
                                    float(str(float32(expected_value))),
                                    float(
                                        str(
                                            float32(actual_row["data"]
                                                    [column_name]))),
                                    msg=
                                    "single value of {} doesn't match actual {}"
                                    .format(
                                        float(str(float32(expected_value))),
                                        float(
                                            str(
                                                float32(actual_row["data"]
                                                        [column_name])))))
                            else:
                                self.assertEqual(
                                    expected_value,
                                    actual_row["data"][column_name],
                                    msg="expected: {} != actual {}".format(
                                        expected_row, actual_row))
                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNone(
                    bookmark.get('current_log_version'),
                    msg="incremental streams should not have a current_log_version bookmark")
                self.assertIsNone(
                    bookmark.get('initial_full_table_complete'),
                    msg="incremental streams should not have an initial_full_table_complete bookmark")
                # the bookmark should hold the max replication-key value synced
                self.assertEqual(
                    bookmark['replication_key_value'],
                    max([
                        row[1] for row in stream_expected_data[self.VALUES]
                        if row[1] is not None
                    ]))
                # self.assertEqual(bookmark['replication_key'], 'replication_key_value')

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(
                                     expected_schemas,
                                     records_by_stream[stream]['schema']))

        # ----------------------------------------------------------------------
        # invoke the sync job AGAIN after inserting, updating, and deleting rows
        # ----------------------------------------------------------------------
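        # mutate the source table: insert two rows, update two rows by pk, and
        # delete one row by pk; the next incremental sync should pick up only
        # rows whose replication key is >= the saved bookmark (the deleted row
        # simply stops appearing, since incremental replication does not track deletes)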

        database_name = "data_types_database"
        schema_name = "dbo"
        table_name = "float_precisions"
        column_names = [
            "pk", "replication_key_column", "float_53", "real_24_bits"
        ]
        insert_value = [(15, 100, 100, 100),
                        (14, 3.4028235e+38, 1.7976931348623157e+308,
                         3.4028235e+38)]
        update_value = [(4, 101, 101, 101),
                        (6, 3.4028233e+38, 1.7976931348623157e+308,
                         3.4028235e+38)]
        delete_value = [(5, )]
        query_list = insert(database_name, schema_name, table_name,
                            insert_value)
        query_list.extend(
            delete_by_pk(database_name, schema_name, table_name, delete_value,
                         column_names[:1]))
        query_list.extend(
            update_by_pk(database_name, schema_name, table_name, update_value,
                         column_names))
        mssql_cursor_context_manager(*query_list)
        # only rows whose replication key is >= the saved bookmark are re-synced,
        # so just the last insert row and last update row qualify
        insert_value = insert_value[-1:]
        update_value = update_value[-1:]
        # expected output: the existing row whose replication key still satisfies
        # key >= bookmark, plus the qualifying update and insert
        self.EXPECTED_METADATA["data_types_database_dbo_float_precisions"]["values"] = \
            [(1, 3.4028230e+38, 1.7976931348623157e+308, 3.4028235e+38)] + update_value + insert_value

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
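        # verify the record counts match the expected post-mutation values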
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)
        records_by_stream = runner.get_records_from_target_output()

        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                new_table_version = records_by_stream[stream]['table_version']
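                # this version should match the one captured during the first
                # sync; both are asserted against the bookmark below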

                # verify a subsequent sync emits an activate_version message
                # both before and after the data
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-1]['action'],
                    'activate_version')
                self.assertTrue(
                    all([
                        message["action"] == "upsert" for message in
                        records_by_stream[stream]['messages'][1:-1]
                    ]),
                    msg="expect all messages between the activate_version messages to be upserts")

                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [{
                    "action": "upsert",
                    "data": dict(zip(column_names, row_values))
                } for row_values in sorted(
                    stream_expected_data[self.VALUES],
                    key=lambda row: (row[1] is not None, row[1]))]

                # sequence numbers are generated at runtime and are not
                # deterministic, so strip them before comparing messages
                for message in records_by_stream[stream]['messages'][1:-1]:
                    message.pop("sequence")

                # Verify all data is correct
                for expected_row, actual_row in zip(
                        expected_messages,
                        records_by_stream[stream]['messages'][1:-1]):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")

                        # expected and actual rows should expose the same set of columns
                        self.assertEqual(
                            len(expected_row["data"]),
                            len(actual_row["data"]),
                            msg="expected and actual rows do not have the same number of columns")
                        for column_name, expected_value in expected_row["data"].items():
                            # look up the column's declared datatype in the expected metadata
                            fields = self.expected_metadata()[stream][self.FIELDS]
                            column_index = [
                                list(field.keys())[0] for field in fields
                            ].index(column_name)
                            datatype = fields[column_index][column_name][self.DATATYPE]
                            actual_value = actual_row["data"][column_name]
                            if datatype in ("real", "float") and actual_value is not None:
                                # floats come back as Decimals; compare through float32
                                # so single-precision rounding does not cause failures
                                self.assertEqual(
                                    type(actual_value), Decimal,
                                    msg="float value is not represented as a Decimal")
                                expected_single = float(str(float32(expected_value)))
                                actual_single = float(str(float32(actual_value)))
                                self.assertEqual(
                                    expected_single, actual_single,
                                    msg="single value of {} doesn't match actual {}".format(
                                        expected_single, actual_single))
                            else:
                                self.assertEqual(
                                    expected_value, actual_value,
                                    msg="expected: {} != actual: {}".format(
                                        expected_row, actual_row))

                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                self.assertIsNone(
                    bookmark.get('current_log_version'),
                    msg="incremental streams should not have a current_log_version bookmark")
                self.assertIsNone(
                    bookmark.get('initial_full_table_complete'),
                    msg="incremental streams should not have an initial_full_table_complete bookmark")
                # the bookmark should hold the max replication-key value synced
                self.assertEqual(
                    bookmark['replication_key_value'],
                    max([
                        row[1] for row in stream_expected_data[self.VALUES]
                        if row[1] is not None
                    ]))
                # self.assertEqual(bookmark['replication_key'], 'replication_key_value')

                self.assertEqual(
                    bookmark['version'],
                    table_version[stream],
                    msg="expected bookmark for stream to match version")
                self.assertEqual(
                    bookmark['version'],
                    new_table_version,
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(
                                     expected_schemas,
                                     records_by_stream[stream]['schema']))