Example #1
    def run_and_verify_sync(self, conn_id, second_sync=False):
        """
        Run a sync job and make sure it exited properly.
        Return a dictionary with keys of streams synced
        and values of records synced for each stream
        """
        # Run a sync job using orchestrator
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # Verify actual rows were synced
        sync_record_count = runner.examine_target_output_file(
            self, conn_id,
            self.expected_first_sync_streams()
            if not second_sync else self.expected_second_sync_streams(),
            self.expected_pks())
        self.assertGreater(
            sum(sync_record_count.values()),
            0,
            msg="failed to replicate any data: {}".format(sync_record_count))
        print("total replicated row count: {}".format(
            sum(sync_record_count.values())))

        return sync_record_count

    def do_test(self, conn_id):
        # Select our catalogs
        our_catalogs = [c for c in self.found_catalogs if c.get('tap_stream_id') in self.expected_sync_streams()]
        for c in our_catalogs:
            c_annotated = menagerie.get_annotated_schema(conn_id, c['stream_id'])
            c_metadata = metadata.to_map(c_annotated['metadata'])
            connections.select_catalog_and_fields_via_metadata(conn_id, c, c_annotated, [], [])

        # Clear state before our run
        menagerie.set_state(conn_id, {})

        # Run a sync job using orchestrator
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # Verify actual rows were synced
        record_count_by_stream = runner.examine_target_output_file(self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = sum(record_count_by_stream.values())
        self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        # Ensure all records have a value for PK(s)
        records = runner.get_records_from_target_output()
        for stream in self.expected_sync_streams():
            messages = records.get(stream, {}).get('messages', [])
            if stream in ['tickets', 'groups', 'users']:
                self.assertGreater(len(messages), 100, msg="Stream {} has fewer than 100 records synced".format(stream))
            for m in messages:
                pk_set = self.expected_pks()[stream]
                for pk in pk_set:
                    self.assertIsNotNone(m.get('data', {}).get(pk), msg="Missing primary-key for message {}".format(m))
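
All of the snippets in this listing are methods taken from larger tap-tester integration-test classes, so their module-level imports are not shown. A minimal sketch of the imports they typically rely on (module paths assume the usual tap-tester and singer-python layout; adjust to your repository):

from functools import reduce   # some examples below sum row counts with reduce(...)
from decimal import Decimal    # used by the expected-record fixtures further down

from singer import metadata    # metadata.to_map(...)

import tap_tester.connections as connections
import tap_tester.menagerie as menagerie
import tap_tester.runner as runner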
Example #3
    def run_test(self):
        conn_id = connections.ensure_connection(self)

        # run in discovery mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        catalog = menagerie.get_catalogs(conn_id)
        found_catalog_names = set(map(lambda c: c['tap_stream_id'], catalog))

        # assert we find the correct streams
        self.assertEqual(self.expected_check_streams(), found_catalog_names)

        for tap_stream_id in self.expected_check_streams():
            found_stream = [
                c for c in catalog if c['tap_stream_id'] == tap_stream_id
            ][0]
            schema_and_metadata = menagerie.get_annotated_schema(
                conn_id, found_stream['stream_id'])
            main_metadata = schema_and_metadata["metadata"]
            stream_metadata = [
                mdata for mdata in main_metadata if mdata["breadcrumb"] == []
            ]

            # assert that the pks are correct
            self.assertEqual(
                self.expected_pks()[tap_stream_id],
                set(stream_metadata[0]['metadata']['table-key-properties']))

        for stream_catalog in catalog:
            annotated_schema = menagerie.get_annotated_schema(
                conn_id, stream_catalog['stream_id'])
            selected_metadata = connections.select_catalog_and_fields_via_metadata(
                conn_id, stream_catalog, annotated_schema['annotated-schema'],
                [])

        # Run sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify the persisted schema was correct
        messages_by_stream = runner.get_records_from_target_output()

        # assert that each of the streams that we synced are the ones that we expect to see
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_first_sync_streams(),
            self.expected_pks())

        # Verify that the full table was synced
        for tap_stream_id in self.expected_first_sync_streams():
            self.assertEqual(
                self.expected_first_sync_row_counts()[tap_stream_id],
                record_count_by_stream[tap_stream_id])
Example #4
    def run_sync_and_get_record_count(self, conn_id):
        sync_job_name = runner.run_sync_mode(self, conn_id)
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        return runner.examine_target_output_file(self, conn_id,
                                                 self.expected_sync_streams(),
                                                 self.expected_pks())
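
As a usage illustration (not part of the original example), here is a sketch of a test method that drives this helper end to end; the discovery and selection steps mirror the other examples in this listing, while the method name and final assertion are assumptions:

    def test_record_counts(self):
        # hypothetical caller for run_sync_and_get_record_count (sketch only)
        conn_id = connections.ensure_connection(self)
        runner.run_check_mode(self, conn_id)

        # select the catalogs we expect to sync, as in the other examples
        found_catalogs = menagerie.get_catalogs(conn_id)
        our_catalogs = [c for c in found_catalogs
                        if c.get('tap_stream_id') in self.expected_sync_streams()]
        for c in our_catalogs:
            c_annotated = menagerie.get_annotated_schema(conn_id, c['stream_id'])
            connections.select_catalog_and_fields_via_metadata(conn_id, c, c_annotated, [], [])

        # start from a clean state, sync, and check the per-stream counts
        menagerie.set_state(conn_id, {})
        record_count_by_stream = self.run_sync_and_get_record_count(conn_id)
        self.assertGreater(sum(record_count_by_stream.values()), 0,
                           msg="failed to replicate any data: {}".format(record_count_by_stream))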
Example #5
    def test_run(self):
        # Select our catalogs
        # found_catalogs = menagerie.get_catalogs(conn_id)
        # our_catalogs = [c for c in found_catalogs if c.get('tap_stream_id') in self.expected_sync_streams()]
        # for c in our_catalogs:
        #     c_annotated = menagerie.get_annotated_schema(conn_id, c['stream_id'])
        #     c_metadata = metadata.to_map(c_annotated['metadata'])
        #     connections.select_catalog_and_fields_via_metadata(conn_id, c, c_annotated, [], [])

        conn_id = self.create_connection()

        # Clear state before our run
        menagerie.set_state(conn_id, {})
        # Select a stream
        found_catalogs = menagerie.get_catalogs(conn_id)
        our_catalogs = [catalog for catalog in found_catalogs if catalog.get('tap_stream_id') in self.expected_sync_streams()]
        self.select_all_streams_and_fields(conn_id, our_catalogs, select_all_fields=False)

        # Run a sync job using orchestrator
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # Verify actual rows were synced
        record_count_by_stream = runner.examine_target_output_file(self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = sum(record_count_by_stream.values())
        self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        # Ensure all records have a value for PK(s)
        records = runner.get_records_from_target_output()
        for stream in self.expected_sync_streams():
            messages = records.get(stream, {}).get('messages')
            for m in messages:
                pk_set = self.expected_pks()[stream]
                for pk in pk_set:
                    self.assertIsNotNone(m.get('data', {}).get(pk), msg="missing primary-key value in record {}".format(m))

        bookmarks = menagerie.get_state(conn_id)['bookmarks']

        replication_methods = self.expected_replication_method()

        for stream in self.expected_sync_streams():
            with self.subTest(stream=stream):
                replication_method = replication_methods.get(stream)
                if replication_method == self.INCREMENTAL:
                    self.assertTrue(stream in bookmarks)

                elif replication_method == self.FULL_TABLE:
                    self.assertTrue(stream not in bookmarks)

                else:
                    raise NotImplementedError(
                        "stream {} has an invalid replication method {}".format(stream, replication_method)
                    )
Example #6
    def first_sync_test(self, table_configs, conn_id):
        # run first full table sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify the persisted schema was correct
        records_by_stream = runner.get_records_from_target_output()
        expected_pks = {}

        for config in table_configs:
            key = {config['HashKey']}
            if config.get('SortKey'):
                key |= {config.get('SortKey')}
            expected_pks[config['TableName']] = key

        # assert that each of the streams that we synced are the ones that we expect to see
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, {x['TableName']
                            for x in table_configs}, expected_pks)

        state = menagerie.get_state(conn_id)
        state_version = menagerie.get_state_version(conn_id)

        first_versions = {}

        # assert that we get the correct number of records for each stream
        for config in table_configs:
            table_name = config['TableName']

            self.assertEqual(config['num_rows'],
                             record_count_by_stream[table_name])

            # assert that an activate_version message is the first and last message sent for each stream
            self.assertEqual(
                'activate_version',
                records_by_stream[table_name]['messages'][0]['action'])
            self.assertEqual(
                'activate_version',
                records_by_stream[table_name]['messages'][-1]['action'])

            # assert that the state has an initial_full_table_complete == True
            self.assertTrue(
                state['bookmarks'][table_name]['initial_full_table_complete'])
            # assert that there is a version bookmark in state
            first_versions[table_name] = state['bookmarks'][table_name][
                'version']
            self.assertIsNotNone(first_versions[table_name])

            # Write state with missing finished_shards so it
            # re-reads data from all shards
            # This should result in the next sync having same number of records
            # as the full table sync
            state['bookmarks'][table_name].pop('finished_shards')
            menagerie.set_state(conn_id, state, version=state_version)
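
The closing comment above implies a follow-up check that this excerpt does not show: after finished_shards is removed from the bookmark, the next sync should replicate the same number of rows as the initial full-table sync. A hedged sketch of that verification (the method name is an assumption; the tap-tester calls are the same ones used above):

    def second_sync_matches_full_table(self, table_configs, conn_id, expected_pks):
        # sketch only: re-run the sync after the state rewrite above
        sync_job_name = runner.run_sync_mode(self, conn_id)
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        second_record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, {config['TableName'] for config in table_configs},
            expected_pks)

        # every table should replicate the same row count as the full table sync
        for config in table_configs:
            self.assertEqual(config['num_rows'],
                             second_record_count_by_stream[config['TableName']])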
Example #7
    def test_catalog_without_properties(self):

        self.setUpTestEnvironment()

        runner.run_check_job_and_check_status(self)

        found_catalogs = menagerie.get_catalogs(self.conn_id)
        self.assertEqual(len(found_catalogs), 1,
                         msg="unable to locate schemas for connection {}".format(self.conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        subset = self.expected_streams().issubset(found_catalog_names)
        self.assertTrue(
            subset, msg="Expected check streams are not subset of discovered catalog")

        our_catalogs = [c for c in found_catalogs if c.get(
            'tap_stream_id') in self.expected_streams()]

        # Select our catalogs
        for c in our_catalogs:
            c_annotated = menagerie.get_annotated_schema(
                self.conn_id, c['stream_id'])
            connections.select_catalog_and_fields_via_metadata(
                self.conn_id, c, c_annotated, [], [])

        # Clear state before our run
        menagerie.set_state(self.conn_id, {})

        # Run a sync job using orchestrator
        sync_job_name = runner.run_sync_mode(self, self.conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(self.conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        synced_records = runner.get_records_from_target_output()
        upsert_messages = [m for m in synced_records.get(
            'csv_with_empty_lines').get('messages') if m['action'] == 'upsert']

        records = [message.get('data') for message in upsert_messages]

        # Empty lines should be ignored in the emitted records.

        expected_records = [
            {'id': 1, 'name': 'John', '_sdc_extra': [{'name': 'carl'}],
             '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
             '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv',
             '_sdc_source_lineno': 2},
            {'id': 2, 'name': 'Bob',
             '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
             '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv',
             '_sdc_source_lineno': 3},
            {'id': 3,
             '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
             '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv',
             '_sdc_source_lineno': 4},
            {'id': 4, 'name': 'Alice',
             '_sdc_extra': [{'no_headers': ['Ben', '5']}, {'name': 'Barak'}],
             '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
             '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv',
             '_sdc_source_lineno': 5}
        ]

        self.assertListEqual(expected_records, records)
Example #8
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        #run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        #verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))

        diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
        self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff))
        print("discovered schemas are OK")

        #select all catalogs

        for c in found_catalogs:
            catalog_entry = menagerie.get_annotated_schema(conn_id, c['stream_id'])
            if c['stream_name'] in self.expected_sync_streams().keys():
                stream = c['stream_name']
                pks = self.expected_sync_streams()[stream]

                for pk in pks:
                    mdata = next((m for m in catalog_entry['metadata']
                                  if len(m['breadcrumb']) == 2 and m['breadcrumb'][1] == pk), None)
                    print("Validating inclusion on {}: {}".format(c['stream_name'], mdata))
                    self.assertTrue(mdata and mdata['metadata']['inclusion'] == 'automatic')

                connections.select_catalog_and_fields_via_metadata(conn_id, c, catalog_entry)

        #clear state
        menagerie.set_state(conn_id, {})

        sync_job_name = runner.run_sync_mode(self, conn_id)

        #verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        first_record_count_by_stream = runner.examine_target_output_file(self, conn_id, set(self.expected_sync_streams().keys()), self.expected_sync_streams())
        replicated_row_count = sum(first_record_count_by_stream.values())
        self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(first_record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        # Verify that automatic fields are all emitted with records
        synced_records = runner.get_records_from_target_output()
        for stream_name, data in synced_records.items():
            record_messages = [set(row['data'].keys()) for row in data['messages']]
            self.assertGreater(len(record_messages), 0, msg="stream {} did not sync any records.".format(stream_name))
            for record_keys in record_messages:
                self.assertEqual(self.expected_sync_streams().get(stream_name, set()) - record_keys, set())
Example #9
    def test_run(self):

        conn_id = connections.ensure_connection(self, payload_hook=None)

        # Run the tap in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # Verify the check's exit status
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # Verify that there are catalogs found
        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        subset = self.expected_check_streams().issubset(found_catalog_names)
        self.assertTrue(
            subset,
            msg="Expected check streams are not subset of discovered catalog")
        # Select some catalogs
        our_catalogs = [
            c for c in found_catalogs
            if c.get('tap_stream_id') in self.expected_sync_streams()
        ]
        for catalog in our_catalogs:
            schema = menagerie.get_annotated_schema(conn_id,
                                                    catalog['stream_id'])
            connections.select_catalog_and_fields_via_metadata(
                conn_id, catalog, schema, [], [])

        # Verify that all streams sync at least one row on the initial sync.
        # This also verifies access-token expiration handling: if the test fails with an
        # authentication error, the refresh token was not replaced after expiring.
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        zero_count_streams = {
            k
            for k, v in record_count_by_stream.items() if v == 0
        }
        self.assertFalse(
            zero_count_streams,
            msg="The following streams did not sync any rows {}".format(
                zero_count_streams))

    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))

        diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
        self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff))
        print("discovered schemas are kosher")

        all_excluded_fields = {}
        # select all catalogs
        for c in found_catalogs:
            if c['stream_name'] == 'ads':
                continue

            discovered_schema = menagerie.get_annotated_schema(conn_id, c['stream_id'])['annotated-schema']
            all_excluded_fields[c['stream_name']] = list(set(discovered_schema.keys()) - self.expected_automatic_fields().get(c['stream_name'], set()))[:5]
            connections.select_catalog_and_fields_via_metadata(
                conn_id,
                c,
                discovered_schema,
                non_selected_fields=all_excluded_fields[c['stream_name']])

        # clear state
        menagerie.set_state(conn_id, {})

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # This should be validating that the PKs are written in each record
        record_count_by_stream = runner.examine_target_output_file(self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = sum(record_count_by_stream.values())
        self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        synced_records = runner.get_records_from_target_output()
        self.assertTrue('ads' not in synced_records.keys())
        for stream_name, data in synced_records.items():
            record_messages = [set(row['data'].keys()) for row in data['messages']]
            for record_keys in record_messages:
                # The intersection should be empty
                self.assertFalse(record_keys.intersection(all_excluded_fields[stream_name]))
Example #11
    def starter(self):
        """
        Instantiate connection, run discovery, and initial sync.

        This entire process needs to retry if we get rate limited so that we are using a fresh connection
        and can test the activate version messages.
        """

        ##########################################################################
        ### Instantiate connection
        ##########################################################################
        self.conn_id = connections.ensure_connection(self)
        
        ##########################################################################
        ### Discovery without the backoff
        ##########################################################################
        check_job_name = runner.run_check_mode(self, self.conn_id)
        exit_status = menagerie.get_exit_status(self.conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)


        found_catalogs = menagerie.get_catalogs(self.conn_id)
        self.assertGreater(len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(self.conn_id))
        found_catalog_names = set(map(lambda c: c['stream_name'], found_catalogs))

        self.assertSetEqual(self.expected_streams(), found_catalog_names, msg="discovered schemas do not match")
        print("discovered schemas are OK")

        
        # table and field selection
        test_catalogs = [catalog for catalog in found_catalogs
                         if catalog.get('stream_name') in self.expected_test_streams]

        self.perform_and_verify_table_and_field_selection(
            self.conn_id, test_catalogs, select_all_fields=True,
        )

        ##########################################################################
        ### Initial sync without the backoff
        ##########################################################################
        sync_job_name = runner.run_sync_mode(self, self.conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(self.conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        self.record_count_by_stream_1 = runner.examine_target_output_file(
            self, self.conn_id, self.expected_streams(), self.expected_primary_keys())
        self.assertGreater(
            sum(self.record_count_by_stream_1.values()), 0,
            msg="failed to replicate any data: {}".format(self.record_count_by_stream_1)
        )
        print("total replicated row count: {}".format(sum(self.record_count_by_stream_1.values())))
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        #run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        #verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))

        diff = self.expected_check_streams().symmetric_difference(
            found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))
        print("discovered schemas are kosher")

        # select all catalogs
        for c in found_catalogs:
            catalog_entry = menagerie.get_annotated_schema(
                conn_id, c['stream_id'])
            connections.select_catalog_via_metadata(conn_id, c, catalog_entry)

        # clear state
        menagerie.set_state(conn_id, {})

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # This should be validating that the PKs are written in each record

        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = reduce(lambda accum, c: accum + c,
                                      record_count_by_stream.values())
        self.assertGreater(replicated_row_count,
                           0,
                           msg="failed to replicate any data: {}".format(
                               record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))
Example #13
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # Run the tap in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # Verify the check's exit status
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # Verify that there are catalogs found
        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        subset = self.expected_check_streams().issubset(found_catalog_names)
        self.assertTrue(
            subset,
            msg="Expected check streams are not subset of discovered catalog")

        # Select some catalogs
        our_catalogs = [
            c for c in found_catalogs
            if c.get('tap_stream_id') in self.expected_sync_streams()
        ]
        for catalog in our_catalogs:
            schema = menagerie.get_annotated_schema(conn_id,
                                                    catalog['stream_id'])
            connections.select_catalog_and_fields_via_metadata(
                conn_id, catalog, schema)

        # Clear State and run sync
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # Verify rows were synced
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = reduce(lambda accum, c: accum + c,
                                      record_count_by_stream.values())
        self.assertGreater(replicated_row_count,
                           0,
                           msg="failed to replicate any data: {}".format(
                               record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))
    def test_run(self):
        conn_id = connections.ensure_connection(self)
        runner.run_check_mode(self, conn_id)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.select_found_catalogs(conn_id,
                                   found_catalogs,
                                   only_streams=self.streams_to_select())
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
Example #15
    def test_run(self):
        conn_id = self.create_connection()

        # Select our catalogs
        our_catalogs = [
            c for c in self.found_catalogs
            if c.get('tap_stream_id') in self.expected_sync_streams()
        ]
        for c in our_catalogs:
            c_annotated = menagerie.get_annotated_schema(
                conn_id, c['stream_id'])
            c_metadata = metadata.to_map(c_annotated['metadata'])
            connections.select_catalog_and_fields_via_metadata(
                conn_id, c, c_annotated, [], [])

        # Clear state before our run
        menagerie.set_state(conn_id, {})

        # Run a sync job using orchestrator
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # Verify actual rows were synced
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = reduce(lambda accum, c: accum + c,
                                      record_count_by_stream.values())
        self.assertGreater(replicated_row_count,
                           0,
                           msg="failed to replicate any data: {}".format(
                               record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        # Ensure all records have a value for PK(s)
        records = runner.get_records_from_target_output()
        for stream in self.expected_sync_streams():
            messages = records.get(stream).get('messages')
            for m in messages:
                pk_set = self.expected_pks()[stream]
                for pk in pk_set:
                    self.assertIsNotNone(m.get('data', {}).get(pk),
                                         msg="missing primary-key value in record {}".format(m))

        bookmarks = menagerie.get_state(conn_id)['bookmarks']

        self.assertTrue('orders' in bookmarks)

    def test_run(self):
        conn_id = connections.ensure_connection(self)

        #run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        #verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))

        diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
        self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff))
        print("discovered schemas are kosher")

        #select all catalogs
        #selected_catalogs = list(map(lambda catalog: self.perform_field_selection(conn_id, catalog), found_catalogs))
        #menagerie.post_annotated_catalogs(conn_id, selected_catalogs)

        for c in found_catalogs:
            connections.select_catalog_and_fields_via_metadata(conn_id, c,
                                                               menagerie.get_annotated_schema(conn_id, c['stream_id']))

        #clear state
        menagerie.set_state(conn_id, {})

        sync_job_name = runner.run_sync_mode(self, conn_id)

        #verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = sum(record_count_by_stream.values())
        self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        # bookmarks for the 4 streams should be 2015-03-16
        states = menagerie.get_state(conn_id)["bookmarks"]
        end_date = self.get_properties()["end_date"].split()[0]
        for k, v in states.items():
            if "insights" in k:
                bm_date = v.get("date_start")
                self.assertEqual(end_date, bm_date)
        print("bookmarks match end_date of {}".format(end_date))
Example #17
    def run_sync(self, conn_id):
        """
        Run a sync job and make sure it exited properly.
        Return a dictionary with keys of streams synced
        and values of records synced for each stream
        """
        # Run a sync job using orchestrator
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # Verify actual rows were synced
        sync_record_count = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        return sync_record_count
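
A short illustrative caller for this helper (not from the original source): it assumes discovery and stream selection have already been performed on conn_id, as in the other examples, and simply checks that every expected stream replicated at least one record:

    def assert_all_streams_synced(self, conn_id):
        # hypothetical helper built on run_sync (sketch only)
        sync_record_count = self.run_sync(conn_id)
        for stream in self.expected_sync_streams():
            with self.subTest(stream=stream):
                self.assertGreater(sync_record_count.get(stream, 0), 0,
                                   msg="stream {} did not sync any records".format(stream))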
Example #18
    def run_and_verify_sync(self, conn_id):
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        sync_record_count = runner.examine_target_output_file(
            self, conn_id, self.expected_check_streams(),
            self.expected_primary_keys())

        self.assertGreater(
            sum(sync_record_count.values()),
            0,
            msg="failed to replicate any data: {}".format(sync_record_count))
        print("total replicated row count: {}".format(
            sum(sync_record_count.values())))

        return sync_record_count
Example #19
    def run_sync(self, expected_streams):
        conn_id = connections.ensure_connection(self)

        found_catalogs = self.run_and_verify_check_mode(conn_id)

        # table and field selection
        test_catalogs = [
            catalog for catalog in found_catalogs
            if catalog.get('stream_name') in expected_streams
        ]

        self.perform_and_verify_table_and_field_selection(
            conn_id, test_catalogs)

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        return runner.get_upserts_from_target_output()
Example #20
    def run_and_verify_sync(self, conn_id, state):
        """
        Run a sync job and make sure it exited properly.
        Return a dictionary with keys of streams synced
        and values of records synced for each stream
        """
        # reset state to the state at the start of the sync in case we got interrupted
        menagerie.set_state(conn_id, state)

        # Run a sync job using orchestrator
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        try:
            menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        except AssertionError as e:
            if exit_status['discovery_error_message'] or exit_status[
                    'tap_error_message']:
                print(
                    "*******************RETRYING SYNC FOR TAP/DISCOVERY FAILURE*******************"
                )
                raise RetryableTapError(e)

            raise

        # Verify actual rows were synced
        sync_record_count = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys())
        self.assertGreater(
            sum(sync_record_count.values()),
            0,
            msg="failed to replicate any data: {}".format(sync_record_count))
        print("total replicated row count: {}".format(
            sum(sync_record_count.values())))

        return sync_record_count
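
RetryableTapError is referenced here but defined elsewhere in the original test module; a minimal sketch of the module-level definition it implies:

class RetryableTapError(Exception):
    # raised when a sync failed with a tap/discovery error worth retrying
    pass

and, inside the test class, a hedged sketch of the retry loop that would wrap the helper above (the wrapper name and attempt limit are assumptions):

    def run_and_verify_sync_with_retries(self, conn_id, state, max_attempts=3):
        for attempt in range(1, max_attempts + 1):
            try:
                return self.run_and_verify_sync(conn_id, state)
            except RetryableTapError:
                if attempt == max_attempts:
                    raise
                print("retrying sync, attempt {} of {}".format(attempt + 1, max_attempts))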
Example #21
        def run_and_verify_sync(self, conn_id):
            """
            Clear the connection's state in menagerie and run a sync.
            Verify the exit code following the sync.

            Return the connection id and record count by stream
            """
            #clear state
            menagerie.set_state(conn_id, {})

            # run sync
            sync_job_name = runner.run_sync_mode(self, conn_id)

            # Verify tap exit codes
            exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
            menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

            # read target output
            first_record_count_by_stream = runner.examine_target_output_file(
                self, conn_id, self.expected_streams(),
                self.expected_primary_keys())

            return first_record_count_by_stream
Example #22
    def test_run(self):
        conn_id = self.ensure_connection()

        # Run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # Verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        # Select all tables and fields
        self.select_all_streams_and_fields(conn_id, found_catalogs)

        # Run a sync job using orchestrator
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # Verify actual rows were synced
        sync_record_count = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys())

        # Examine target output
        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                # Each stream should have 1 or more records returned
                self.assertGreaterEqual(sync_record_count[stream], 1)
Example #23
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = [
            fc for fc in menagerie.get_catalogs(conn_id)
            if fc['tap_stream_id'] in self.expected_check_streams()
        ]

        self.assertEqual(
            len(found_catalogs),
            1,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        diff = self.expected_check_streams().symmetric_difference(
            found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))

        # verify that persisted streams have the correct properties
        chicken_catalog = found_catalogs[0]

        self.assertEqual('chicken_view', chicken_catalog['stream_name'])
        print("discovered streams are correct")

        print('checking discovered metadata for ROOT-CHICKEN_VIEW')
        md = menagerie.get_annotated_schema(
            conn_id, chicken_catalog['stream_id'])['metadata']

        self.assertEqual(
            {
                (): {
                    'database-name': 'postgres',
                    'is-view': True,
                    'row-count': 0,
                    'schema-name': 'public',
                    'table-key-properties': []
                },
                ('properties', 'fk_id'): {
                    'inclusion': 'available',
                    'sql-datatype': 'bigint',
                    'selected-by-default': True
                },
                ('properties', 'name'): {
                    'inclusion': 'available',
                    'sql-datatype': 'character varying',
                    'selected-by-default': True
                },
                ('properties', 'age'): {
                    'inclusion': 'available',
                    'sql-datatype': 'integer',
                    'selected-by-default': True
                },
                ('properties', 'size'): {
                    'inclusion': 'available',
                    'sql-datatype': 'character varying',
                    'selected-by-default': True
                },
                ('properties', 'id'): {
                    'inclusion': 'available',
                    'sql-datatype': 'integer',
                    'selected-by-default': True
                }
            }, metadata.to_map(md))

        # 'ID' selected as view-key-properties
        replication_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-key': None,
                "replication-method": "FULL_TABLE",
                'view-key-properties': ["id"]
            }
        }]

        connections.select_catalog_and_fields_via_metadata(
            conn_id, chicken_catalog,
            menagerie.get_annotated_schema(conn_id,
                                           chicken_catalog['stream_id']),
            replication_md)

        # clear state
        menagerie.set_state(conn_id, {})

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())

        self.assertEqual(record_count_by_stream, {'chicken_view': 1})
        records_by_stream = runner.get_records_from_target_output()

        table_version = records_by_stream['chicken_view']['table_version']
        self.assertEqual(
            records_by_stream['chicken_view']['messages'][0]['action'],
            'activate_version')
        self.assertEqual(
            records_by_stream['chicken_view']['messages'][1]['action'],
            'upsert')
        self.assertEqual(
            records_by_stream['chicken_view']['messages'][2]['action'],
            'activate_version')

        # verifications about individual records
        for stream, recs in records_by_stream.items():
            # verify the persisted schema was correct
            self.assertEqual(
                recs['schema'],
                expected_schemas[stream],
                msg=
                "Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

        actual_chicken_record = records_by_stream['chicken_view']['messages'][
            1]['data']

        expected_chicken_record = {
            'id': 1,
            'fk_id': 1,
            'name': 'fred',
            'age': 99,
            'size': 'big'
        }
        self.assertEqual(
            actual_chicken_record,
            expected_chicken_record,
            msg=
            "Expected `various_types` upsert record data to be {}, but target output {}"
            .format(expected_chicken_record, actual_chicken_record))

        print("records are correct")

        # verify state and bookmarks
        state = menagerie.get_state(conn_id)

        chicken_bookmark = state['bookmarks']['postgres-public-chicken_view']
        self.assertIsNone(state['currently_syncing'],
                          msg="expected state's currently_syncing to be None")
        self.assertEqual(
            chicken_bookmark['version'],
            table_version,
            msg="expected bookmark for stream ROOT-CHICKEN to match version")
Example #24
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = [
            fc for fc in menagerie.get_catalogs(conn_id)
            if fc['tap_stream_id'] in self.expected_check_streams()
        ]
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))

        diff = self.expected_check_streams().symmetric_difference(
            found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))

        # verify that persisted streams have the correct properties
        for c in found_catalogs:
            catalog_props_to_check = ['stream_name', 'tap_stream_id']
            stream = c['stream_name']

            for prop in catalog_props_to_check:
                self.assertEqual(
                    c[prop],
                    expected_catalogs[stream][prop],
                    msg=
                    "unexpected stream catalog property `{}` for stream `{}`: `{}` != `{}`"
                    .format(prop, stream, expected_catalogs[stream][prop],
                            c[prop]))

        print("discovered streams are correct")

        print('checking discovered metadata for tap_tester_mysql_0-incremental')
        incremental_catalog = [
            c for c in found_catalogs
            if c['tap_stream_id'] == 'tap_tester_mysql_0-incremental'
        ][0]
        md = menagerie.get_annotated_schema(
            conn_id, incremental_catalog['stream_id'])['metadata']

        incremental_stream_metadata = {
            'database-name': 'tap_tester_mysql_0',
            'row-count': 3,
            'is-view': False,
            'selected-by-default': False,
            'table-key-properties': ['c_pk']
        }

        self.assertEqual(
            sorted(md, key=lambda x: x['breadcrumb']),
            [{
                'breadcrumb': [],
                'metadata': incremental_stream_metadata
            }, {
                'breadcrumb': ['properties', 'c_dt'],
                'metadata': {
                    'selected-by-default': True,
                    'sql-datatype': 'datetime'
                }
            }, {
                'breadcrumb': ['properties', 'c_pk'],
                'metadata': {
                    'selected-by-default': True,
                    'sql-datatype': 'int(11)'
                }
            }, {
                'breadcrumb': ['properties', 'c_varchar'],
                'metadata': {
                    'selected-by-default': True,
                    'sql-datatype': 'varchar(255)'
                }
            }, {
                'breadcrumb': ['properties', 'c_varchar_to_deselect'],
                'metadata': {
                    'selected-by-default': True,
                    'sql-datatype': 'varchar(255)'
                }
            }])

        print('checking discovered metadata for tap_tester_mysql_1-view')
        view_catalog = [
            c for c in found_catalogs
            if c['tap_stream_id'] == 'tap_tester_mysql_1-view'
        ][0]
        view_catalog_key_properties_md = [{
            'breadcrumb': [],
            'metadata': {
                'view-key-properties': ['c_pk']
            }
        }]

        connections.set_non_discoverable_metadata(
            conn_id, view_catalog,
            menagerie.get_annotated_schema(conn_id, view_catalog['stream_id']),
            view_catalog_key_properties_md)
        md = menagerie.get_annotated_schema(
            conn_id, view_catalog['stream_id'])['metadata']

        view_stream_metadata = {
            'database-name': 'tap_tester_mysql_1',
            'is-view': True,
            'selected-by-default': False,
            'view-key-properties': ['c_pk']
        }

        self.assertEqual(sorted(md, key=lambda x: x['breadcrumb']),
                         [{
                             'breadcrumb': [],
                             'metadata': view_stream_metadata
                         }, {
                             'breadcrumb': ['properties', 'c_pk'],
                             'metadata': {
                                 'selected-by-default': True,
                                 'sql-datatype': 'int(11)'
                             }
                         }, {
                             'breadcrumb': ['properties', 'c_varchar'],
                             'metadata': {
                                 'selected-by-default': True,
                                 'sql-datatype': 'varchar(255)'
                             }
                         }])

        # No selected-by-default MD for c_year because it is an unsupported type
        various_types_catalog = [
            c for c in found_catalogs
            if c['tap_stream_id'] == 'tap_tester_mysql_0-various_types'
        ][0]
        md = menagerie.get_annotated_schema(
            conn_id, various_types_catalog['stream_id'])['metadata']
        c_year_md = [
            x for x in md if x['breadcrumb'] == ['properties', 'c_year']
        ]
        self.assertEqual(c_year_md, [{
            'breadcrumb': ['properties', 'c_year'],
            'metadata': {
                'selected-by-default': False,
                'sql-datatype': 'year(4)'
            }
        }])

        # select every catalog except tap_tester_mysql_0-simple_example
        catalogs_to_select = [
            c for c in found_catalogs
            if c['tap_stream_id'] != 'tap_tester_mysql_0-simple_example'
        ]

        for a_catalog in catalogs_to_select:
            additional_md = []
            unselected_fields = []
            if a_catalog['tap_stream_id'] == 'tap_tester_mysql_0-incremental':
                additional_md = [{
                    "breadcrumb": [],
                    "metadata": {
                        'replication-key': 'c_dt',
                        'replication-method': 'INCREMENTAL'
                    }
                }]
                unselected_fields = ['c_varchar_to_deselect']

            elif a_catalog['tap_stream_id'] == 'tap_tester_mysql_1-view':
                additional_md = [{
                    "breadcrumb": [],
                    "metadata": {
                        'view-key-properties': ['c_pk'],
                        'replication-method': 'FULL_TABLE'
                    }
                }]
            else:
                additional_md = [{
                    "breadcrumb": [],
                    "metadata": {
                        'replication-method': 'FULL_TABLE'
                    }
                }]

            selected_metadata = connections.select_catalog_and_fields_via_metadata(
                conn_id, a_catalog,
                menagerie.get_annotated_schema(conn_id,
                                               a_catalog['stream_id']),
                additional_md, unselected_fields)
        # clear state
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = reduce(lambda accum, c: accum + c,
                                      record_count_by_stream.values())
        expected_row_count = 8  # {'my_isam': 1, 'various_types': 3, 'incremental': 3, 'view': 1}
        self.assertEqual(
            replicated_row_count,
            expected_row_count,
            msg="failed to replicate correct number of rows: {}".format(
                record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        records_by_stream = runner.get_records_from_target_output()

        # verifications about individual records
        for stream, recs in records_by_stream.items():
            # verify that activate version messages were sent in the proper position
            self.assertEqual(
                recs['messages'][0]['action'],
                'activate_version',
                msg=
                "Expected first message sent for stream `{}` to have action `activate_version`"
                .format(stream))

            # verify the persisted schema was correct
            self.assertEqual(
                recs['schema'],
                expected_schemas[stream],
                msg=
                "Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

        # verify that the target output the proper numeric and date representations
        expected_various_types_records = [{
            'c_time': '1970-01-01T12:34:56.000000Z',
            'c_mediumint': 8388607,
            'c_smallint': 32767,
            'c_tinyint': 127,
            'c_date': '2017-09-13T00:00:00.000000Z',
            'c_bigint': 9223372036854775807,
            'c_decimal': -1,
            'c_int': 2147483647,
            'c_bit': True,
            'c_decimal_2': Decimal('123456789.0'),
            'c_pk': 1,
            'c_double': Decimal("1.234"),
            'c_float': Decimal("1.234"),
            'c_decimal_2_unsigned': Decimal("1.23"),
            'c_tinyint_1': True
        }, {
            'c_time': '1970-01-01T12:34:57.000000Z',
            'c_mediumint': -8388608,
            'c_smallint': -32768,
            'c_tinyint': -128,
            'c_date': '2017-09-14T00:00:00.000000Z',
            'c_bigint': -9223372036854775808,
            'c_decimal': 0,
            'c_int': -2147483648,
            'c_bit': False,
            'c_decimal_2': Decimal("123456790.0"),
            'c_pk': 2,
            'c_double': Decimal("2.234"),
            'c_float': Decimal("2.234"),
            'c_decimal_2_unsigned': Decimal("0.23"),
            'c_tinyint_1': False
        }, {
            'c_time': '1970-01-01T12:34:57.000000Z',
            'c_mediumint': -8388608,
            'c_smallint': -32768,
            'c_tinyint': -128,
            'c_date': '2017-09-14T00:00:00.000000Z',
            'c_bigint': -9223372036854775808,
            'c_decimal': 0,
            'c_int': -2147483648,
            'c_bit': None,
            'c_decimal_2': Decimal("123456790.0"),
            'c_pk': 3,
            'c_double': Decimal("2.234"),
            'c_float': Decimal("2.234"),
            'c_decimal_2_unsigned': Decimal("0.23"),
            'c_tinyint_1': None
        }]

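        # messages[0] is the activate_version message, so the three upsert records are at indices 1 through 3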
        actual_various_types_records = [
            r['data']
            for r in records_by_stream['various_types']['messages'][1:4]
        ]

        self.assertEqual(
            actual_various_types_records,
            expected_various_types_records,
            msg=
            "Expected `various_types` upsert record data to be {}, but target output {}"
            .format(expected_various_types_records,
                    actual_various_types_records))

        # verify that deselected property was not output
        expected_incremental_record = {
            'c_pk': 1,
            'c_dt': '2017-01-01T00:00:00.000000Z',
            'c_varchar': 'a'
        }

        actual_incremental_record = records_by_stream['incremental'][
            'messages'][1]['data']

        self.assertEqual(
            actual_incremental_record,
            expected_incremental_record,
            msg=
            "Expected first `incremental` upsert record data to be {}, but target output {}"
            .format(expected_incremental_record, actual_incremental_record))

        print("records are correct")

        # verify state and bookmarks
        state = menagerie.get_state(conn_id)
        bookmarks = state['bookmarks']
        self.assertIsNone(state['currently_syncing'],
                          msg="expected state's currently_syncing to be None")
        for k, v in bookmarks.items():
            if k == 'tap_tester_mysql_0-incremental':
                self.assertIsNotNone(
                    v['version'],
                    msg="expected bookmark for stream `{}` to have a version set"
                    .format(k))
                self.assertEqual(
                    v['replication_key_value'],
                    '2017-01-01T00:00:02.000000Z',
                    msg=
                    "incorrect replication_key_value in bookmark for stream `{}`"
                    .format(k))
                self.assertEqual(
                    v['replication_key'],
                    'c_dt',
                    msg=
                    "incorrect replication_key specified in bookmark for stream `{}`"
                    .format(k))
            else:
                self.assertFalse(
                    'version' in v,
                    msg=
                    "expected bookmark for stream `{}` to not have a version key"
                    .format(k))
                self.assertTrue(
                    'initial_full_table_complete' in v,
                    msg=
                    "expected bookmark for stream `{}` to have a true initial_full_table_complete key"
                    .format(k))
        print("state and bookmarks are correct")

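        # capture the incremental stream's table version so we can verify it is unchanged after the next sync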
        incremental_table_initial_table_version = bookmarks[
            'tap_tester_mysql_0-incremental']['version']

        #----------------------------------------------------------------------
        # invoke the sync job again after some modifications
        #----------------------------------------------------------------------

        print("adding a column to an existing table in the source db")
        connection = db_utils.get_db_connection(self.get_properties(),
                                                self.get_credentials())

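        # add a nullable integer column to the incremental table and insert a new row that populates it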
        with connection.cursor() as cursor:
            add_column_sql = '''
                ALTER TABLE tap_tester_mysql_0.incremental
                  ADD COLUMN favorite_number INTEGER;
                INSERT INTO tap_tester_mysql_0.incremental VALUES (4, '4', '2017-01-01 00:00:03', 'yeehaw', 999);
            '''
            cursor.execute(add_column_sql)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = [
            fc for fc in menagerie.get_catalogs(conn_id)
            if fc['tap_stream_id'] in self.expected_check_streams()
        ]
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))

        diff = self.expected_check_streams().symmetric_difference(
            found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = reduce(lambda accum, c: accum + c,
                                      record_count_by_stream.values())
        expected_row_count = 7  # {'my_isam': 1, 'various_types': 3, 'incremental': 2, 'view': 1}
        self.assertEqual(
            replicated_row_count,
            expected_row_count,
            msg="failed to replicate correct number of rows: {}".format(
                record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        records_by_stream = runner.get_records_from_target_output()

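        # the new favorite_number column should appear in the stream's schema as a nullable 32-bit integer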
        expected_schema_of_new_column = {
            'maximum': 2147483647,
            'selected': True,
            'inclusion': 'available',
            'type': ['null', 'integer'],
            'minimum': -2147483648
        }

        # verifications about individual records
        for stream, recs in records_by_stream.items():
            # verify that activate_version messages were sent in the proper position
            if stream == 'incremental':
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version',
                    msg=
                    "Expected first message sent for stream `{}` not to have action `activate_version`"
                    .format(stream))
                expected_schema_of_new_column = {
                    'maximum': 2147483647,
                    'inclusion': 'available',
                    'type': ['null', 'integer'],
                    'minimum': -2147483648
                }
                self.assertEqual(
                    records_by_stream[stream]['schema']['properties']
                    ['favorite_number'],
                    expected_schema_of_new_column,
                    msg=
                    "Expected newly-added column to be present in schema for stream `{}`, but it was not."
                    .format(stream))
            else:
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'upsert',
                    msg=
                    "Expected first message sent for stream `{}` to have action `upsert`"
                    .format(stream))
                self.assertEqual(
                    records_by_stream[stream]['messages'][-1]['action'],
                    'activate_version',
                    msg=
                    "Expected last message sent for stream `{}` to have action `activate_version`"
                    .format(stream))

        state = menagerie.get_state(conn_id)
        bookmarks = state['bookmarks']
        self.assertIsNone(state['currently_syncing'],
                          msg="expected state's currently_syncing to be None")
        for k, v in bookmarks.items():
            if k == 'tap_tester_mysql_0-incremental':
                self.assertIsNotNone(
                    v['version'],
                    msg="expected bookmark for stream `{}` to have a version set"
                    .format(k))
                self.assertEqual(
                    v['replication_key_value'],
                    '2017-01-01T00:00:03.000000Z',
                    msg=
                    "incorrect replication_key_value in bookmark for stream `{}`"
                    .format(k))
                self.assertEqual(
                    v['replication_key'],
                    'c_dt',
                    msg=
                    "incorrect replication_key specified in bookmark for stream `{}`"
                    .format(k))
            else:
                self.assertFalse(
                    'version' in v,
                    msg=
                    "expected bookmark for stream `{}` to not have a version key"
                    .format(k))
                self.assertTrue(
                    'initial_full_table_complete' in v,
                    msg=
                    "expected bookmark for stream `{}` to have a true initial_full_table_complete key"
                    .format(k))

        print("state and bookmarks are correct")

        # verify incremental table_version didn't change
        incremental_table_new_table_version = bookmarks[
            'tap_tester_mysql_0-incremental']['version']

        self.assertEqual(
            incremental_table_initial_table_version,
            incremental_table_new_table_version,
            msg=
            "Expected incrementally-replicated table's table_version to remain unchanged over multiple invocations."
        )
Example #25
0
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = [
            fc for fc in menagerie.get_catalogs(conn_id)
            if fc['tap_stream_id'] in self.expected_check_streams()
        ]

        self.assertGreaterEqual(
            len(found_catalogs),
            2,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        diff = self.expected_check_streams().symmetric_difference(
            found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))

        # verify that persisted streams have the correct properties

        test_catalog_cows = list(
            filter(
                lambda c: c['stream_name'] ==
                'postgres_logical_replication_test_cows', found_catalogs))[0]
        self.assertEqual('postgres_logical_replication_test_cows',
                         test_catalog_cows['stream_name'])

        test_catalog_chickens = list(
            filter(
                lambda c: c['stream_name'] ==
                'postgres_logical_replication_test_chickens',
                found_catalogs))[0]
        self.assertEqual('postgres_logical_replication_test_chickens',
                         test_catalog_chickens['stream_name'])
        print("discovered streams are correct")

        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'LOG_BASED'
            }
        }]
        connections.select_catalog_and_fields_via_metadata(
            conn_id, test_catalog_cows,
            menagerie.get_annotated_schema(conn_id,
                                           test_catalog_cows['stream_id']),
            additional_md)
        connections.select_catalog_and_fields_via_metadata(
            conn_id, test_catalog_chickens,
            menagerie.get_annotated_schema(conn_id,
                                           test_catalog_chickens['stream_id']),
            additional_md)

        # clear state
        menagerie.set_state(conn_id, {})

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())

        self.assertEqual(
            record_count_by_stream, {
                'postgres_logical_replication_test_cows': 1,
                'postgres_logical_replication_test_chickens': 1
            })
        records_by_stream = runner.get_records_from_target_output()

        table_version_cows = records_by_stream[
            'postgres_logical_replication_test_cows']['table_version']
        self.assertEqual(
            records_by_stream['postgres_logical_replication_test_cows']
            ['messages'][0]['action'], 'activate_version')
        self.assertEqual(
            records_by_stream['postgres_logical_replication_test_cows']
            ['messages'][1]['action'], 'upsert')
        self.assertEqual(
            records_by_stream['postgres_logical_replication_test_cows']
            ['messages'][2]['action'], 'activate_version')

        table_version_chickens = records_by_stream[
            'postgres_logical_replication_test_chickens']['table_version']
        self.assertEqual(
            records_by_stream['postgres_logical_replication_test_chickens']
            ['messages'][0]['action'], 'activate_version')
        self.assertEqual(
            records_by_stream['postgres_logical_replication_test_chickens']
            ['messages'][1]['action'], 'upsert')
        self.assertEqual(
            records_by_stream['postgres_logical_replication_test_chickens']
            ['messages'][2]['action'], 'activate_version')

        # verify state and bookmarks
        state = menagerie.get_state(conn_id)
        self.assertIsNone(state['currently_syncing'],
                          msg="expected state's currently_syncing to be None")

        bookmark_cows = state['bookmarks'][
            'dev-public-postgres_logical_replication_test_cows']
        self.assertIsNotNone(bookmark_cows['lsn'],
                             msg="expected bookmark for stream to have an lsn")
        lsn_cows_1 = bookmark_cows['lsn']
        self.assertEqual(bookmark_cows['version'],
                         table_version_cows,
                         msg="expected bookmark for stream to match version")

        bookmark_chickens = state['bookmarks'][
            'dev-public-postgres_logical_replication_test_chickens']
        self.assertIsNotNone(bookmark_chickens['lsn'],
                             msg="expected bookmark for stream to have an lsn")
        lsn_chickens_1 = bookmark_chickens['lsn']
        self.assertEqual(bookmark_chickens['version'],
                         table_version_chickens,
                         msg="expected bookmark for stream to match version")

        #----------------------------------------------------------------------
        # invoke the sync job again after adding records
        #----------------------------------------------------------------------
        print("inserting 2 more cows and 2 more chickens")

        with db_utils.get_test_connection('dev') as conn:
            conn.autocommit = True
            with conn.cursor() as cur:
                # insert another cow
                self.cows_rec_2 = {'cow_name': "betty cow", 'cow_age': 21}
                insert_record(cur, test_table_name_cows, self.cows_rec_2)
                # update that cow's expected values
                self.cows_rec_2['id'] = 2
                self.cows_rec_2['_sdc_deleted_at'] = None

                # insert another chicken
                self.chicken_rec_2 = {
                    'chicken_name': "burt chicken",
                    'chicken_age': 14
                }
                insert_record(cur, test_table_name_chickens,
                              self.chicken_rec_2)
                # update that chicken's expected values
                self.chicken_rec_2['id'] = 2
                self.chicken_rec_2['_sdc_deleted_at'] = None

                # and repeat...

                self.cows_rec_3 = {'cow_name': "cindy cow", 'cow_age': 10}
                insert_record(cur, test_table_name_cows, self.cows_rec_3)
                self.cows_rec_3['id'] = 3
                self.cows_rec_3['_sdc_deleted_at'] = None

                self.chicken_rec_3 = {
                    'chicken_name': "carl chicken",
                    'chicken_age': 4
                }
                insert_record(cur, test_table_name_chickens,
                              self.chicken_rec_3)
                self.chicken_rec_3['id'] = 3
                self.chicken_rec_3['_sdc_deleted_at'] = None

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        self.assertEqual(
            record_count_by_stream, {
                'postgres_logical_replication_test_cows': 2,
                'postgres_logical_replication_test_chickens': 2
            })
        records_by_stream = runner.get_records_from_target_output()
        chicken_messages = records_by_stream[
            "postgres_logical_replication_test_chickens"]['messages']
        cow_messages = records_by_stream[
            "postgres_logical_replication_test_cows"]['messages']

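        # the upsert messages should match the rows inserted above, in insertion order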
        self.assertDictEqual(self.cows_rec_2, cow_messages[0]['data'])
        self.assertDictEqual(self.chicken_rec_2, chicken_messages[0]['data'])
        self.assertDictEqual(self.cows_rec_3, cow_messages[1]['data'])
        self.assertDictEqual(self.chicken_rec_3, chicken_messages[1]['data'])

        print("inserted record is correct")

        state = menagerie.get_state(conn_id)
        self.assertIsNone(state['currently_syncing'],
                          msg="expected state's currently_syncing to be None")
        cows_bookmark = state['bookmarks'][
            'dev-public-postgres_logical_replication_test_cows']
        self.assertIsNotNone(
            cows_bookmark['lsn'],
            msg=
            "expected bookmark for stream public-postgres_logical_replication_test to have an scn"
        )
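        # the lsn bookmark should not move backwards after replicating the new rows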
        lsn_cows_2 = cows_bookmark['lsn']
        self.assertGreaterEqual(lsn_cows_2, lsn_cows_1)

        chickens_bookmark = state['bookmarks'][
            'dev-public-postgres_logical_replication_test_chickens']
        self.assertIsNotNone(
            chickens_bookmark['lsn'],
            msg=
            "expected bookmark for stream public-postgres_logical_replication_test to have an scn"
        )
        lsn_chickens_2 = chickens_bookmark['lsn']
        self.assertGreaterEqual(lsn_chickens_2, lsn_chickens_1)

        # table_version does NOT change
        self.assertEqual(
            chickens_bookmark['version'],
            table_version_chickens,
            msg=
            "expected bookmark for stream dev-public-postgres_logical_replication_test_chickens to match version"
        )

        # table_version does NOT change
        self.assertEqual(
            cows_bookmark['version'],
            table_version_cows,
            msg=
            "expected bookmark for stream dev-public-postgres_logical_replication_test_cows to match version"
        )
Example #26
0
    def test_run(self):

        conn_id = connections.ensure_connection(self)

        #  -------------------------------
        # -----------  Discovery ----------
        #  -------------------------------

        # run in discovery mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        found_catalogs = menagerie.get_catalogs(conn_id)

        # assert we find the correct streams
        self.assertEqual(self.expected_check_streams(),
                         {c['tap_stream_id']
                          for c in found_catalogs})

        for tap_stream_id in self.expected_check_streams():
            found_stream = [
                c for c in found_catalogs
                if c['tap_stream_id'] == tap_stream_id
            ][0]

            # assert that the pks are correct
            self.assertEqual(
                self.expected_pks()[found_stream['stream_name']],
                set(
                    found_stream.get('metadata',
                                     {}).get('table-key-properties')))

            # assert that the row counts are correct
            self.assertEqual(
                self.expected_row_counts()[found_stream['stream_name']],
                found_stream.get('metadata', {}).get('row-count'))

        #  -----------------------------------
        # ----------- Initial Full Table ---------
        #  -----------------------------------
        # Select simple_coll_1 and simple_coll_2 streams and add replication method metadata
        for stream_catalog in found_catalogs:
            annotated_schema = menagerie.get_annotated_schema(
                conn_id, stream_catalog['stream_id'])
            additional_md = [{
                "breadcrumb": [],
                "metadata": {
                    'replication-method': 'LOG_BASED'
                }
            }]
            selected_metadata = connections.select_catalog_and_fields_via_metadata(
                conn_id, stream_catalog, annotated_schema, additional_md)

        # Run sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # grab the records that were written to the target
        records_by_stream = runner.get_records_from_target_output()

        # assert that each of the streams that we synced are the ones that we expect to see
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())

        # Verify that the full table was synced
        for tap_stream_id in self.expected_sync_streams():
            self.assertGreaterEqual(record_count_by_stream[tap_stream_id],
                                    self.expected_row_counts()[tap_stream_id])

        # Verify that we have 'initial_full_table_complete' bookmark
        state = menagerie.get_state(conn_id)
        first_versions = {}

        for tap_stream_id in self.expected_check_streams():
            # assert that the state has an initial_full_table_complete == True
            self.assertTrue(state['bookmarks'][tap_stream_id]
                            ['initial_full_table_complete'])
            # assert that there is a version bookmark in state
            first_versions[tap_stream_id] = state['bookmarks'][tap_stream_id][
                'version']
            self.assertIsNotNone(first_versions[tap_stream_id])
            # Verify that we have a oplog_ts_time and oplog_ts_inc bookmark
            self.assertIsNotNone(
                state['bookmarks'][tap_stream_id]['oplog_ts_time'])
            self.assertIsNotNone(
                state['bookmarks'][tap_stream_id]['oplog_ts_inc'])

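        # delete, update, and insert documents in each collection so the next
        # oplog sync has changes to pick up; track every affected _id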
        changed_ids = set()
        with get_test_connection() as client:
            # Delete two documents for each collection

            changed_ids.add(client['simple_db']['simple_coll_1'].find(
                {'int_field': 0})[0]['_id'])
            client["simple_db"]["simple_coll_1"].delete_one({'int_field': 0})

            changed_ids.add(client['simple_db']['simple_coll_1'].find(
                {'int_field': 1})[0]['_id'])
            client["simple_db"]["simple_coll_1"].delete_one({'int_field': 1})

            changed_ids.add(client['simple_db']['simple_coll_2'].find(
                {'int_field': 0})[0]['_id'])
            client["simple_db"]["simple_coll_2"].delete_one({'int_field': 0})

            changed_ids.add(client['simple_db']['simple_coll_2'].find(
                {'int_field': 1})[0]['_id'])
            client["simple_db"]["simple_coll_2"].delete_one({'int_field': 1})

            # Update two documents for each collection
            changed_ids.add(client['simple_db']['simple_coll_1'].find(
                {'int_field': 48})[0]['_id'])
            client["simple_db"]["simple_coll_1"].update_one(
                {'int_field': 48}, {'$set': {
                    'int_field': -1
                }})

            changed_ids.add(client['simple_db']['simple_coll_1'].find(
                {'int_field': 49})[0]['_id'])
            client["simple_db"]["simple_coll_1"].update_one(
                {'int_field': 49}, {'$set': {
                    'int_field': -1
                }})

            changed_ids.add(client['simple_db']['simple_coll_2'].find(
                {'int_field': 98})[0]['_id'])
            client["simple_db"]["simple_coll_2"].update_one(
                {'int_field': 98}, {'$set': {
                    'int_field': -1
                }})

            changed_ids.add(client['simple_db']['simple_coll_2'].find(
                {'int_field': 99})[0]['_id'])
            client["simple_db"]["simple_coll_2"].update_one(
                {'int_field': 99}, {'$set': {
                    'int_field': -1
                }})

            # Insert two documents for each collection
            client["simple_db"]["simple_coll_1"].insert_one({
                "int_field":
                50,
                "string_field":
                random_string_generator()
            })
            changed_ids.add(client['simple_db']['simple_coll_1'].find(
                {'int_field': 50})[0]['_id'])

            client["simple_db"]["simple_coll_1"].insert_one({
                "int_field":
                51,
                "string_field":
                random_string_generator()
            })
            changed_ids.add(client['simple_db']['simple_coll_1'].find(
                {'int_field': 51})[0]['_id'])

            client["simple_db"]["simple_coll_2"].insert_one({
                "int_field":
                100,
                "string_field":
                random_string_generator()
            })
            changed_ids.add(client['simple_db']['simple_coll_2'].find(
                {'int_field': 100})[0]['_id'])

            client["simple_db"]["simple_coll_2"].insert_one({
                "int_field":
                101,
                "string_field":
                random_string_generator()
            })
            changed_ids.add(client['simple_db']['simple_coll_2'].find(
                {'int_field': 101})[0]['_id'])

        #  -----------------------------------
        # ----------- Subsequent Oplog Sync ---------
        #  -----------------------------------

        # Run sync

        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # grab the target output and keep only the upsert messages for each stream
        messages_by_stream = runner.get_records_from_target_output()
        records_by_stream = {}
        for stream_name in self.expected_sync_streams():
            records_by_stream[stream_name] = [
                x for x in messages_by_stream[stream_name]['messages']
                if x.get('action') == 'upsert'
            ]

        # assert that each of the streams that we synced are the ones that we expect to see
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())

        # Verify that we got at least 6 records due to changes
        # (could be more due to overlap in gte oplog clause)
        for k, v in record_count_by_stream.items():
            self.assertGreaterEqual(v, 6)

        # Verify that we got 2 records with _SDC_DELETED_AT
        self.assertEqual(
            2,
            len([
                x['data'] for x in records_by_stream['simple_coll_1']
                if x['data'].get('_sdc_deleted_at')
            ]))
        self.assertEqual(
            2,
            len([
                x['data'] for x in records_by_stream['simple_coll_2']
                if x['data'].get('_sdc_deleted_at')
            ]))
        # Verify that the _id of the records sent are the same set as the
        # _ids of the documents changed
        actual = set([
            ObjectId(x['data']['_id'])
            for x in records_by_stream['simple_coll_1']
        ]).union(
            set([
                ObjectId(x['data']['_id'])
                for x in records_by_stream['simple_coll_2']
            ]))
        self.assertEqual(changed_ids, actual)
Example #27
0
    def test_run(self):
        """
        Verify that a full sync can capture all data and send it in the correct format
        for integer and boolean (bit) data.
        Verify that the first sync sends an activate_version message immediately.
        Verify that the table version is incremented.
        """
        print("running test {}".format(self.name()))

        conn_id = self.create_connection()

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # get the catalog information of discovery
        found_catalogs = menagerie.get_catalogs(conn_id)
        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'FULL_TABLE'
            }
        }]
        BaseTapTest.select_all_streams_and_fields(conn_id,
                                                  found_catalogs,
                                                  additional_md=additional_md)

        # clear state
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify record counts of streams
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_streams(),
            self.expected_primary_keys_by_stream_id())
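        # the expected count for each stream is the number of expected value rows in its metadata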
        expected_count = {
            k: len(v['values'])
            for k, v in self.expected_metadata().items()
        }
        self.assertEqual(record_count_by_stream, expected_count)

        # verify records match on the first sync
        records_by_stream = runner.get_records_from_target_output()

        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                stream_expected_data = self.expected_metadata()[stream]
                # TODO - test schema matches expectations based on data type, nullable, not nullable, datetimes as string +, etc
                #   This needs to be consistent based on replication method so you can change replication methods
                table_version = records_by_stream[stream]['table_version']

                # verify on the first sync you get activate version message before and after all data
                self.assertEqual(
                    records_by_stream[stream]['messages'][0]['action'],
                    'activate_version')
                self.assertEqual(
                    records_by_stream[stream]['messages'][-1]['action'],
                    'activate_version')
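                # build the expected upsert messages by zipping the expected
                # column names with each row of expected values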
                column_names = [
                    list(field_data.keys())[0]
                    for field_data in stream_expected_data[self.FIELDS]
                ]

                expected_messages = [{
                    "action": "upsert",
                    "data": {
                        column: value
                        for column, value in list(
                            zip(column_names, stream_expected_data[self.VALUES]
                                [row]))
                    }
                } for row in range(len(stream_expected_data[self.VALUES]))]

                # remove sequences from actual values for comparison
                for message in records_by_stream[stream]['messages'][1:-1]:
                    message.pop("sequence")

                # Verify all data is correct
                for expected_row, actual_row in list(
                        zip(expected_messages,
                            records_by_stream[stream]['messages'][1:-1])):
                    with self.subTest(expected_row=expected_row):
                        self.assertEqual(actual_row["action"], "upsert")
                        self.assertEqual(
                            len(expected_row["data"].keys()),
                            len(actual_row["data"].keys()),
                            msg="there are not the same number of columns")

                        for column_name, expected_value in expected_row[
                                "data"].items():
                            self.assertEqual(
                                expected_value,
                                actual_row["data"][column_name],
                                msg="expected: {} != actual {}".format(
                                    expected_row, actual_row))
                print("records are correct for stream {}".format(stream))

                # verify state and bookmarks
                state = menagerie.get_state(conn_id)
                bookmark = state['bookmarks'][stream]

                self.assertIsNone(
                    state.get('currently_syncing'),
                    msg="expected state's currently_syncing to be None")
                # TODO - change this to something for mssql once binlog (cdc) is finalized and we know what it is
                self.assertIsNone(
                    bookmark.get('lsn'),
                    msg=
                    "expected bookmark for stream to have NO lsn because we are using full-table replication"
                )

                self.assertEqual(
                    bookmark['version'],
                    table_version,
                    msg="expected bookmark for stream to match version")

                expected_schemas = self.expected_metadata()[stream]['schema']
                self.assertEqual(records_by_stream[stream]['schema'],
                                 expected_schemas,
                                 msg="expected: {} != actual: {}".format(
                                     expected_schemas,
                                     records_by_stream[stream]['schema']))
Example #28
0
    def binlog_json_test(self):
        print("RUNNING {}\n\n".format(self.name()))

        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        expected_check_streams = {self.tap_stream_id()}
        expected_sync_streams = {self.table_name()}
        expected_pks = {self.table_name(): {'id'}}

        # verify the tap discovered the right streams
        found_catalogs = [
            catalog for catalog in menagerie.get_catalogs(conn_id)
            if catalog['tap_stream_id'] in expected_check_streams
        ]

        self.assertGreaterEqual(
            len(found_catalogs),
            1,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        diff = expected_check_streams.symmetric_difference(found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))

        # verify that persisted streams have the correct properties
        test_catalog = found_catalogs[0]

        self.assertEqual(self.table_name(), test_catalog['stream_name'])

        print("discovered streams are correct")

        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'LOG_BASED'
            }
        }]
        selected_metadata = connections.select_catalog_and_fields_via_metadata(
            conn_id, test_catalog,
            menagerie.get_annotated_schema(conn_id, test_catalog['stream_id']),
            additional_md)

        # clear state
        menagerie.set_state(conn_id, {})

        # run initial full table sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify the persisted schema was correct
        records_by_stream = runner.get_records_from_target_output()

        self.maxDiff = None
        for stream, recs in records_by_stream.items():
            self.assertEqual(
                recs['schema'],
                expected_schemas[stream],
                msg=
                "Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, expected_sync_streams, expected_pks)

        self.assertEqual(record_count_by_stream, {self.table_name(): 1})
        records_for_stream = runner.get_records_from_target_output()[
            self.table_name()]
        messages_for_stream = records_for_stream['messages']
        message_actions = [rec['action'] for rec in messages_for_stream]

        self.assertEqual(message_actions,
                         ['activate_version', 'upsert', 'activate_version'])

        # ensure some log_file and log_pos state was persisted
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][self.tap_stream_id()]

        self.assertIsNotNone(bookmark['log_file'])
        self.assertIsNotNone(bookmark['log_pos'])

        expected_log_file = bookmark['log_file']
        expected_log_pos = bookmark['log_pos']

        # grab version, log_file and log_pos from state to check later
        expected_table_version = records_for_stream['table_version']

        self.assertEqual(expected_table_version, bookmark['version'])

        # check for expected records
        upsert_records = [
            m['data'] for m in messages_for_stream if m['action'] == 'upsert'
        ]

        self.assertEqual([expected_rec_1], upsert_records)

        # run binlog sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # check that the table version in state is unchanged
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][self.tap_stream_id()]

        self.assertEqual(expected_table_version, bookmark['version'])

        # verify the persisted schema was correct
        records_by_stream = runner.get_records_from_target_output()

        for stream, recs in records_by_stream.items():
            self.assertEqual(
                recs['schema'],
                expected_schemas[stream],
                msg=
                "Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

        # record count should be empty as we did not persist anything to the gate
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, expected_sync_streams, expected_pks)

        self.assertEqual(record_count_by_stream, {})

        # insert a new huge row
        data = dict([('foooo%i' % i, 'baaaaar%i' % i) for i in range(2560)],
                    literal=True)
        rec = {'id': 2, 'our_json': json.dumps(data)}

        with db_utils.get_db_connection(
                self.get_properties(), self.get_credentials()).cursor() as cur:
            self.insert_record(cur, rec)

        # run binlog sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # check that version from state is unchanged
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][self.tap_stream_id()]

        self.assertEqual(expected_table_version, bookmark['version'])

        # Either the log_file is the same but the log_pos has increased or the log_file
        # has rotated and the numeric suffix has increased
        if expected_log_file == bookmark['log_file']:
            self.assertGreater(bookmark['log_pos'], expected_log_pos)
        else:
            expected_log_file_suffix = re.search(r'^.*\.(\d+)$',
                                                 expected_log_file).groups()[0]
            updated_log_file_suffix = re.search(
                r'^.*\.(\d+)$', bookmark['log_file']).groups()[0]

            self.assertGreater(int(updated_log_file_suffix),
                               int(expected_log_file_suffix))

        expected_log_file = bookmark['log_file']
        expected_log_pos = bookmark['log_pos']

        expected_rec_2 = copy.deepcopy(rec)

        # check for expected records
        records_for_stream = runner.get_records_from_target_output()[
            self.table_name()]
        messages_for_stream = records_for_stream['messages']
        message_actions = [rec['action'] for rec in messages_for_stream]

        self.assertEqual(message_actions, ['upsert'])

        upsert_records = [
            m['data'] for m in messages_for_stream if m['action'] == 'upsert'
        ]
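        # the binlog upsert carries an _sdc_deleted_at field that the inserted row does not, so drop it before comparing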
        del upsert_records[0]['_sdc_deleted_at']

        expected_json = json.loads(expected_rec_2.get('our_json', {}))
        actual_json = json.loads(upsert_records[0].get('our_json', {}))

        self.assertGreater(len(actual_json.keys()), 0)
        self.assertEqual(expected_json, actual_json)
Example #29
0
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify discovery produced (at least) 1 expected catalog
        found_catalogs = [
            found_catalog for found_catalog in menagerie.get_catalogs(conn_id)
            if found_catalog['tap_stream_id'] in self.expected_check_streams()
        ]
        self.assertGreaterEqual(len(found_catalogs), 1)

        # verify the tap discovered the expected streams
        found_catalog_names = {
            catalog['tap_stream_id']
            for catalog in found_catalogs
        }
        self.assertSetEqual(self.expected_check_streams(), found_catalog_names)

        # verify that persisted streams have the correct properties
        test_catalog = found_catalogs[0]
        self.assertEqual(test_table_name, test_catalog['stream_name'])
        print("discovered streams are correct")

        # perform table selection
        print('selecting {} and all fields within the table'.format(
            test_table_name))
        schema_and_metadata = menagerie.get_annotated_schema(
            conn_id, test_catalog['stream_id'])
        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'FULL_TABLE'
            }
        }]
        _ = connections.select_catalog_and_fields_via_metadata(
            conn_id, test_catalog, schema_and_metadata, additional_md)

        # clear state
        menagerie.set_state(conn_id, {})

        # run sync job 1 and verify exit codes
        sync_job_name = runner.run_sync_mode(self, conn_id)
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # get records
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        records_by_stream = runner.get_records_from_target_output()
        table_version_1 = records_by_stream[test_table_name]['table_version']
        messages = records_by_stream[test_table_name]['messages']

        # verify the expected number of records were replicated
        self.assertEqual(3, record_count_by_stream[test_table_name])

        # verify the message actions match expectations
        self.assertEqual(5, len(messages))
        self.assertEqual('activate_version', messages[0]['action'])
        self.assertEqual('upsert', messages[1]['action'])
        self.assertEqual('upsert', messages[2]['action'])
        self.assertEqual('upsert', messages[3]['action'])
        self.assertEqual('activate_version', messages[4]['action'])

        # verify the persisted schema matches expectations
        self.assertEqual(expected_schemas[test_table_name],
                         records_by_stream[test_table_name]['schema'])

        # verify replicated records match expectations
        self.assertDictEqual(self.expected_records[0], messages[1]['data'])
        self.assertDictEqual(self.expected_records[1], messages[2]['data'])
        self.assertDictEqual(self.expected_records[2], messages[3]['data'])

        print("records are correct")

        # grab bookmarked state
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][
            'dev-public-postgres_full_table_replication_test']

        # verify state and bookmarks meet expectations
        self.assertIsNone(state['currently_syncing'])
        self.assertIsNone(bookmark.get('lsn'))
        self.assertIsNone(bookmark.get('replication_key'))
        self.assertIsNone(bookmark.get('replication_key_value'))
        self.assertEqual(table_version_1, bookmark['version'])

        #----------------------------------------------------------------------
        # invoke the sync job AGAIN and get the same 3 records
        #----------------------------------------------------------------------

        # run sync job 2 and verify exit codes
        sync_job_name = runner.run_sync_mode(self, conn_id)
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # get records
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        records_by_stream = runner.get_records_from_target_output()
        table_version_2 = records_by_stream[test_table_name]['table_version']
        messages = records_by_stream[test_table_name]['messages']

        # verify the expected number of records were replicated
        self.assertEqual(3, record_count_by_stream[test_table_name])

        # verify the message actions match expectations
        self.assertEqual(4, len(messages))
        self.assertEqual('upsert', messages[0]['action'])
        self.assertEqual('upsert', messages[1]['action'])
        self.assertEqual('upsert', messages[2]['action'])
        self.assertEqual('activate_version', messages[3]['action'])

        # verify the new table version increased on the second sync
        self.assertGreater(table_version_2, table_version_1)

        # verify the persisted schema still matches expectations
        self.assertEqual(expected_schemas[test_table_name],
                         records_by_stream[test_table_name]['schema'])

        # verify replicated records still match expectations
        self.assertDictEqual(self.expected_records[0], messages[0]['data'])
        self.assertDictEqual(self.expected_records[1], messages[1]['data'])
        self.assertDictEqual(self.expected_records[2], messages[2]['data'])

        # grab bookmarked state
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][
            'dev-public-postgres_full_table_replication_test']

        # verify state and bookmarks meet expectations
        self.assertIsNone(state['currently_syncing'])
        self.assertIsNone(bookmark.get('lsn'))
        self.assertIsNone(bookmark.get('replication_key'))
        self.assertIsNone(bookmark.get('replication_key_value'))
        self.assertEqual(table_version_2, bookmark['version'])

        #----------------------------------------------------------------------
        # invoke the sync job AGAIN following various manipulations to the data
        #----------------------------------------------------------------------

        with db_utils.get_test_connection('dev') as conn:
            conn.autocommit = True
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:

                # NB | We will perform the following actions prior to the next sync:
                #      [Action (EXPECTED RESULT)]

                #      Insert a record
                #      Insert a record to be updated prior to sync
                #      Insert a record to be deleted prior to sync (NOT REPLICATED)

                #      Update an existing record
                #      Update a newly inserted record

                #      Delete an existing record
                #      Delete a newly inserted record

                # inserting...
                # a new record
                nyc_tz = pytz.timezone('America/New_York')
                our_time_offset = "-04:00"
                our_ts = datetime.datetime(1996, 4, 4, 4, 4, 4, 733184)
                our_ts_tz = nyc_tz.localize(our_ts)
                our_time = datetime.time(6, 6, 6)
                our_time_tz = our_time.isoformat() + our_time_offset
                our_date = datetime.date(1970, 7, 1)
                my_uuid = str(uuid.uuid1())
                self.inserted_records.append({
                    'our_varchar':
                    "our_varchar 2",
                    'our_varchar_10':
                    "varchar_10",
                    'our_text':
                    "some text 2",
                    'our_integer':
                    44101,
                    'our_smallint':
                    2,
                    'our_bigint':
                    1000001,
                    'our_decimal':
                    decimal.Decimal('9876543210.02'),
                    quote_ident('OUR TS', cur):
                    our_ts,
                    quote_ident('OUR TS TZ', cur):
                    our_ts_tz,
                    quote_ident('OUR TIME', cur):
                    our_time,
                    quote_ident('OUR TIME TZ', cur):
                    our_time_tz,
                    quote_ident('OUR DATE', cur):
                    our_date,
                    'our_double':
                    decimal.Decimal('1.1'),
                    'our_real':
                    decimal.Decimal('1.2'),
                    'our_boolean':
                    True,
                    'our_bit':
                    '1',
                    'our_json':
                    json.dumps({'nymn': 77}),
                    'our_jsonb':
                    json.dumps({'burgers': 'good++'}),
                    'our_uuid':
                    my_uuid,
                    'our_citext':
                    'cyclops 2',
                    'our_store':
                    'dances=>"floor",name=>"betty"',
                    'our_cidr':
                    '192.168.101.128/25',
                    'our_inet':
                    '192.168.101.128/24',
                    'our_mac':
                    '08:00:2b:01:02:04',
                    'our_money':
                    '$0.98789'
                })
                self.expected_records.append({
                    'id':
                    4,
                    'our_varchar':
                    "our_varchar 2",
                    'our_varchar_10':
                    "varchar_10",
                    'our_text':
                    "some text 2",
                    'our_integer':
                    44101,
                    'our_smallint':
                    2,
                    'our_bigint':
                    1000001,
                    'our_decimal':
                    decimal.Decimal('9876543210.02'),
                    'OUR TS':
                    self.expected_ts(our_ts),
                    'OUR TS TZ':
                    self.expected_ts_tz(our_ts_tz),
                    'OUR TIME':
                    str(our_time),
                    'OUR TIME TZ':
                    str(our_time_tz),
                    'OUR DATE':
                    '1970-07-01T00:00:00+00:00',
                    'our_double':
                    decimal.Decimal('1.1'),
                    'our_real':
                    decimal.Decimal('1.2'),
                    'our_boolean':
                    True,
                    'our_bit':
                    True,
                    'our_json':
                    '{"nymn": 77}',
                    'our_jsonb':
                    '{"burgers": "good++"}',
                    'our_uuid':
                    self.inserted_records[-1]['our_uuid'],
                    'our_citext':
                    self.inserted_records[-1]['our_citext'],
                    'our_store': {
                        "name": "betty",
                        "dances": "floor"
                    },
                    'our_cidr':
                    self.inserted_records[-1]['our_cidr'],
                    'our_inet':
                    self.inserted_records[-1]['our_inet'],
                    'our_mac':
                    self.inserted_records[-1]['our_mac'],
                    'our_money':
                    '$0.99',
                    'our_alignment_enum':
                    None,
                })
                # a new record which we will then update prior to sync
                our_ts = datetime.datetime(2007, 1, 1, 12, 12, 12, 222111)
                nyc_tz = pytz.timezone('America/New_York')
                our_ts_tz = nyc_tz.localize(our_ts)
                our_time = datetime.time(12, 11, 10)
                our_time_tz = our_time.isoformat() + "-04:00"
                our_date = datetime.date(1999, 9, 9)
                my_uuid = str(uuid.uuid1())
                self.inserted_records.append({
                    'our_varchar':
                    "our_varchar 4",
                    'our_varchar_10':
                    "varchar_3",
                    'our_text':
                    "some text 4",
                    'our_integer':
                    55200,
                    'our_smallint':
                    1,
                    'our_bigint':
                    100000,
                    'our_decimal':
                    decimal.Decimal('1234567899.99'),
                    quote_ident('OUR TS', cur):
                    our_ts,
                    quote_ident('OUR TS TZ', cur):
                    our_ts_tz,
                    quote_ident('OUR TIME', cur):
                    our_time,
                    quote_ident('OUR TIME TZ', cur):
                    our_time_tz,
                    quote_ident('OUR DATE', cur):
                    our_date,
                    'our_double':
                    decimal.Decimal('1.1'),
                    'our_real':
                    decimal.Decimal('1.2'),
                    'our_boolean':
                    True,
                    'our_bit':
                    '0',
                    'our_json':
                    json.dumps('some string'),
                    'our_jsonb':
                    json.dumps(['burgers are good']),
                    'our_uuid':
                    my_uuid,
                    'our_store':
                    'size=>"small",name=>"betty"',
                    'our_citext':
                    'cyclops 3',
                    'our_cidr':
                    '192.168.101.128/25',
                    'our_inet':
                    '192.168.101.128/24',
                    'our_mac':
                    '08:00:2b:01:02:04',
                    'our_money':
                    None,
                })
                self.expected_records.append({
                    'our_decimal':
                    decimal.Decimal('1234567899.99'),
                    'our_text':
                    'some text 4',
                    'our_bit':
                    False,
                    'our_integer':
                    55200,
                    'our_double':
                    decimal.Decimal('1.1'),
                    'id':
                    5,
                    'our_json':
                    self.inserted_records[-1]['our_json'],
                    'our_boolean':
                    True,
                    'our_jsonb':
                    self.inserted_records[-1]['our_jsonb'],
                    'our_bigint':
                    100000,
                    'OUR TS':
                    self.expected_ts(our_ts),
                    'OUR TS TZ':
                    self.expected_ts_tz(our_ts_tz),
                    'OUR TIME':
                    str(our_time),
                    'OUR TIME TZ':
                    str(our_time_tz),
                    'our_store': {
                        "name": "betty",
                        "size": "small"
                    },
                    'our_smallint':
                    1,
                    'OUR DATE':
                    '1999-09-09T00:00:00+00:00',
                    'our_varchar':
                    'our_varchar 4',
                    'our_uuid':
                    self.inserted_records[-1]['our_uuid'],
                    'our_real':
                    decimal.Decimal('1.2'),
                    'our_varchar_10':
                    'varchar_3',
                    'our_citext':
                    'cyclops 3',
                    'our_cidr':
                    '192.168.101.128/25',
                    'our_inet':
                    '192.168.101.128/24',
                    'our_mac':
                    '08:00:2b:01:02:04',
                    'our_money':
                    None,
                    'our_alignment_enum':
                    None,
                })
                # a new record to be deleted prior to sync
                our_ts = datetime.datetime(2111, 1, 1, 12, 12, 12, 222111)
                nyc_tz = pytz.timezone('America/New_York')
                our_ts_tz = nyc_tz.localize(our_ts)
                our_time = datetime.time(12, 11, 10)
                our_time_tz = our_time.isoformat() + "-04:00"
                our_date = datetime.date(1999, 9, 9)
                my_uuid = str(uuid.uuid1())
                self.inserted_records.append({
                    'our_varchar':
                    "our_varchar 4",
                    'our_varchar_10':
                    "varchar_3",
                    'our_text':
                    "some text 4",
                    'our_integer':
                    55200,
                    'our_smallint':
                    1,
                    'our_bigint':
                    100000,
                    'our_decimal':
                    decimal.Decimal('1234567899.99'),
                    quote_ident('OUR TS', cur):
                    our_ts,
                    quote_ident('OUR TS TZ', cur):
                    our_ts_tz,
                    quote_ident('OUR TIME', cur):
                    our_time,
                    quote_ident('OUR TIME TZ', cur):
                    our_time_tz,
                    quote_ident('OUR DATE', cur):
                    our_date,
                    'our_double':
                    decimal.Decimal('1.1'),
                    'our_real':
                    decimal.Decimal('1.2'),
                    'our_boolean':
                    True,
                    'our_bit':
                    '0',
                    'our_json':
                    json.dumps('some string'),
                    'our_jsonb':
                    json.dumps(['burgers are good']),
                    'our_uuid':
                    my_uuid,
                    'our_store':
                    'size=>"small",name=>"betty"',
                    'our_citext':
                    'cyclops 3',
                    'our_cidr':
                    '192.168.101.128/25',
                    'our_inet':
                    '192.168.101.128/24',
                    'our_mac':
                    '08:00:2b:01:02:04',
                    'our_money':
                    None,
                })
                self.expected_records.append({
                    'our_decimal':
                    decimal.Decimal('1234567899.99'),
                    'our_text':
                    'some text 4',
                    'our_bit':
                    False,
                    'our_integer':
                    55200,
                    'our_double':
                    decimal.Decimal('1.1'),
                    'id':
                    6,
                    'our_json':
                    self.inserted_records[-1]['our_json'],
                    'our_boolean':
                    True,
                    'our_jsonb':
                    self.inserted_records[-1]['our_jsonb'],
                    'our_bigint':
                    100000,
                    'OUR TS':
                    self.expected_ts(our_ts),
                    'OUR TS TZ':
                    self.expected_ts_tz(our_ts_tz),
                    'OUR TIME':
                    str(our_time),
                    'OUR TIME TZ':
                    str(our_time_tz),
                    'our_store': {
                        "name": "betty",
                        "size": "small"
                    },
                    'our_smallint':
                    1,
                    'OUR DATE':
                    '1999-09-09T00:00:00+00:00',
                    'our_varchar':
                    'our_varchar 4',
                    'our_uuid':
                    self.inserted_records[-1]['our_uuid'],
                    'our_real':
                    decimal.Decimal('1.2'),
                    'our_varchar_10':
                    'varchar_3',
                    'our_citext':
                    'cyclops 3',
                    'our_cidr':
                    '192.168.101.128/25',
                    'our_inet':
                    '192.168.101.128/24',
                    'our_mac':
                    '08:00:2b:01:02:04',
                    'our_money':
                    None,
                    'our_alignment_enum':
                    None,
                })

                db_utils.insert_record(cur, test_table_name,
                                       self.inserted_records[3])
                db_utils.insert_record(cur, test_table_name,
                                       self.inserted_records[4])
                db_utils.insert_record(cur, test_table_name,
                                       self.inserted_records[5])

                # updating ...
                # an existing record
                canon_table_name = db_utils.canonicalized_table_name(
                    cur, test_schema_name, test_table_name)
                record_pk = 1
                our_ts = datetime.datetime(2021, 4, 4, 4, 4, 4, 733184)
                our_ts_tz = nyc_tz.localize(our_ts)
                updated_data = {
                    "OUR TS TZ": our_ts_tz,
                    "our_double": decimal.Decimal("6.6"),
                    "our_money": "$0.00"
                }
                self.expected_records[0]["OUR TS TZ"] = self.expected_ts_tz(
                    our_ts_tz)
                self.expected_records[0]["our_double"] = decimal.Decimal("6.6")
                self.expected_records[0]["our_money"] = "$0.00"

                db_utils.update_record(cur, canon_table_name, record_pk,
                                       updated_data)

                # a newly inserted record
                canon_table_name = db_utils.canonicalized_table_name(
                    cur, test_schema_name, test_table_name)
                record_pk = 5
                our_ts = datetime.datetime(2021, 4, 4, 4, 4, 4, 733184)
                our_ts_tz = nyc_tz.localize(our_ts)
                updated_data = {
                    "OUR TS TZ": our_ts_tz,
                    "our_double": decimal.Decimal("6.6"),
                    "our_money": "$0.00"
                }
                self.expected_records[4]["OUR TS TZ"] = self.expected_ts_tz(
                    our_ts_tz)
                self.expected_records[4]["our_double"] = decimal.Decimal("6.6")
                self.expected_records[4]["our_money"] = "$0.00"

                db_utils.update_record(cur, canon_table_name, record_pk,
                                       updated_data)

                # deleting
                # an existing record
                record_pk = 2
                db_utils.delete_record(cur, canon_table_name, record_pk)

                # a newly inserted record
                record_pk = 6
                db_utils.delete_record(cur, canon_table_name, record_pk)

        #----------------------------------------------------------------------
        # invoke the sync job AGAIN after various manipulations
        #----------------------------------------------------------------------

        # run sync job 3 and verify exit codes
        sync_job_name = runner.run_sync_mode(self, conn_id)
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # get records
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        records_by_stream = runner.get_records_from_target_output()
        table_version_3 = records_by_stream[test_table_name]['table_version']
        messages = records_by_stream[test_table_name]['messages']

        # verify the expected number of records were replicated
        self.assertEqual(4, record_count_by_stream[test_table_name])

        # verify the message actions match expectations (see the sketch after this example)
        self.assertEqual(5, len(messages))
        self.assertEqual('upsert', messages[0]['action'])
        self.assertEqual('upsert', messages[1]['action'])
        self.assertEqual('upsert', messages[2]['action'])
        self.assertEqual('upsert', messages[3]['action'])
        self.assertEqual('activate_version', messages[4]['action'])

        # verify the table version increased again on this third sync
        self.assertGreater(table_version_3, table_version_2)

        # verify the persisted schema still matches expectations
        self.assertEqual(expected_schemas[test_table_name],
                         records_by_stream[test_table_name]['schema'])

        # NB | This is a little tough to track mentally so here's a breakdown of
        #      the order of operations by expected records indexes:

        #      Prior to Sync 1
        #        insert 0, 1, 2

        #      Prior to Sync 2
        #        No db changes

        #      Prior to Sync 3
        #        insert 3, 4, 5
        #        update 0, 4
        #        delete 1, 5

        #      Resulting Synced Records: 2, 3, 0, 4

        # verify replicated records still match expectations
        self.assertDictEqual(self.expected_records[2],
                             messages[0]['data'])  # existing insert
        self.assertDictEqual(self.expected_records[3],
                             messages[1]['data'])  # new insert
        self.assertDictEqual(self.expected_records[0],
                             messages[2]['data'])  # existing update
        self.assertDictEqual(self.expected_records[4],
                             messages[3]['data'])  # new insert / update

        # grab bookmarked state
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][
            'dev-public-postgres_full_table_replication_test']

        # verify state and bookmarks meet expectations
        self.assertIsNone(state['currently_syncing'])
        self.assertIsNone(bookmark.get('lsn'))
        self.assertIsNone(bookmark.get('replication_key'))
        self.assertIsNone(bookmark.get('replication_key_value'))
        self.assertEqual(table_version_3, bookmark['version'])
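The four action assertions above follow a pattern that recurs in these full-table tests: every replicated row arrives as an 'upsert' and the sync closes with a single 'activate_version', while each re-sync assigns a strictly larger table version. Below is a minimal sketch of that check as a reusable helper; the name assert_full_table_message_pattern is hypothetical and not part of the tap-tester framework.

def assert_full_table_message_pattern(test_case, stream_output, prior_table_version):
    """Sketch: a full-table sync should emit only upserts followed by one
    activate_version, and assign a table version newer than the prior sync's."""
    actions = [m['action'] for m in stream_output['messages']]

    # every message except the last should be an upsert
    test_case.assertTrue(all(action == 'upsert' for action in actions[:-1]))
    test_case.assertEqual('activate_version', actions[-1])

    # a full-table re-sync bumps the table version
    test_case.assertGreater(stream_output['table_version'], prior_table_version)

With records_by_stream from runner.get_records_from_target_output(), the explicit checks above would collapse to assert_full_table_message_pattern(self, records_by_stream[test_table_name], table_version_2).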
Пример #30
0
    def binlog_edge_test(self, expected_records=None):
        """
        Test binlog replication edge cases
        • Verify an initial sync returns expected records of various datatypes
        • Verify we bookmark correctly when a transaction spans multiple files
        • Insert and delete a record prior to sync. Verify both events are replicated
        • Insert and update a record prior to sync. Verify both events are replicated
        • Verify a valid log_file and log_pos state are persisted after each sync
        """

        # guard against the shared mutable default argument
        if expected_records is None:
            expected_records = []

        conn_id = connections.ensure_connection(self)

        # prior to first sync update a record...
        updated_timestamp = datetime.datetime.now()
        updated_id = 1
        expected_records[1]['our_timestamp_2'] = datetime.datetime.strftime(
            updated_timestamp, "%Y-%m-%dT%H:%M:%S.%fZ")

        # insert a record and...
        inserted_record = self.generate_record_n(len(expected_records))
        expected_records += [inserted_record]  # TODO need to format

        # delete a record
        deleted_id = 2

        with db_utils.get_db_connection(
                self.get_properties(), self.get_credentials()).cursor() as cur:
            cur.execute(
                "UPDATE {}.{} SET our_timestamp_2 = '{}' WHERE id = {}".format(
                    self.database_name(), self.table_name_1(),
                    updated_timestamp, updated_id))

            self.insert_record(cur, inserted_record, self.table_name_1())

            delete_time = datetime.datetime.now()
            cur.execute("DELETE FROM {}.{} WHERE id = {}".format(
                self.database_name(), self.table_name_1(), deleted_id))

        print(
            "\n\nMySQL DB Actions." + \
            "\nNAME: {}\nTABLE: {}".format(self.database_name(), self.table_name_1()) + \
            "\nEVENTS: {} records updated".format(1) + \
            "\n        {} records deleted\n\n".format(1)
        )

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        t1 = self.table_name_1()
        t2 = self.table_name_2()
        expected_check_streams = {
            self.tap_stream_id(t1),
            self.tap_stream_id(t2)
        }
        expected_sync_streams = {t1, t2}
        expected_pks = {t1: {'id'}, t2: {'id'}}

        # verify the tap discovered the right streams
        found_catalogs = [
            catalog for catalog in menagerie.get_catalogs(conn_id)
            if catalog['tap_stream_id'] in expected_check_streams
        ]

        self.assertGreaterEqual(
            len(found_catalogs),
            1,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        diff = expected_check_streams.symmetric_difference(found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))

        # verify that persisted streams have the correct properties
        self.assertEqual(self.table_name_1(), found_catalogs[0]['stream_name'])
        self.assertEqual(self.table_name_2(), found_catalogs[1]['stream_name'])
        print("discovered streams are correct")

        additional_md = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'LOG_BASED'
            }
        }]
        for catalog in found_catalogs:
            schema = menagerie.get_annotated_schema(conn_id,
                                                    catalog['stream_id'])
            _ = connections.select_catalog_and_fields_via_metadata(
                conn_id, catalog, schema, additional_md)

        # clear state
        menagerie.set_state(conn_id, {})

        # run initial full table sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # verify the persisted schema was correct
        records_by_stream = runner.get_records_from_target_output()
        self.maxDiff = None
        for stream, recs in records_by_stream.items():
            self.assertEqual(
                recs['schema'],
                expected_schemas[stream],
                msg=
                "Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, expected_sync_streams, expected_pks)

        # BUG missing deleted record | https://stitchdata.atlassian.net/browse/SRCE-4258
        # self.assertEqual({self.table_name_1(): len(expected_records)}, record_count_by_stream)
        records_for_stream = runner.get_records_from_target_output()[
            self.table_name_1()]
        messages_for_stream = records_for_stream['messages']
        message_actions = [rec['action'] for rec in messages_for_stream]

        # verify activate version messages are present
        self.assertEqual('activate_version', message_actions[0])
        self.assertEqual('activate_version', message_actions[-1])

        # ensure some log_file and log_pos state was persisted
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][self.tap_stream_id(t1)]

        self.assertIsNotNone(bookmark['log_file'])
        self.assertIsNotNone(bookmark['log_pos'])

        expected_log_file = bookmark['log_file']
        expected_log_pos = bookmark['log_pos']

        # grab the table version from the target output to compare against state later
        expected_table_version = records_for_stream['table_version']

        self.assertEqual(expected_table_version, bookmark['version'])

        # check for expected records
        upsert_records = [
            m['data'] for m in messages_for_stream if m['action'] == 'upsert'
        ]
        # we need to compare record by record since there are so many.
        # a failure comparing expected_records to upsert_records would result in
        # an output message greater in length than a standard tmux buffer
        # BUG missing datetime precision | https://stitchdata.atlassian.net/browse/SRCE-4257
        # for expected_record in expected_records:
        #     upsert_record = [rec for rec in upsert_records
        #                      if rec['id'] == expected_record['id']]
        #     self.assertEqual(1, len(upsert_record),
        #                      msg="multiple upsert_recs with same pk: {}".format(upsert_record))
        #     self.assertEqual(expected_record, upsert_record.pop())

        # TODO add check for _sdc_delete_at for deleted record once bug addressed

        # run binlog sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # check that the table version from state is unchanged
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][self.tap_stream_id(t1)]

        self.assertEqual(expected_table_version, bookmark['version'])

        # verify the persisted schema was correct
        records_by_stream = runner.get_records_from_target_output()
        for stream, recs in records_by_stream.items():
            self.assertEqual(
                recs['schema'],
                expected_schemas[stream],
                msg=
                "Persisted schema did not match expected schema for stream `{}`."
                .format(stream))

        # record count should be empty as we did not persist anything to the gate
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, expected_sync_streams, expected_pks)
        self.assertEqual(record_count_by_stream, {})

        # Create 1 more record prior to 2nd sync
        new_record = self.generate_record_n(len(expected_records))
        with db_utils.get_db_connection(
                self.get_properties(), self.get_credentials()).cursor() as cur:
            self.insert_record(cur, new_record, self.table_name_1())
        print(
            "\n\nMySQL DB Actions." + \
            "\nNAME: {}\nTABLE: {}".format(self.database_name(), self.table_name_1()) + \
            "\nEVENTS: {} records inserted".format(1)
        )

        # run binlog sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # check that version from state is unchanged
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][self.tap_stream_id(t1)]

        self.assertEqual(expected_table_version, bookmark['version'])

        # Either the log_file is the same but the log_pos has increased, or the log_file
        # has rotated and the numeric suffix has increased (see the sketch after this example)
        if expected_log_file == bookmark['log_file']:
            print("PATH A")
            self.assertGreater(bookmark['log_pos'], expected_log_pos)
        else:
            expected_log_file_suffix = re.search(r'^.*\.(\d+)$',
                                                 expected_log_file).groups()[0]
            updated_log_file_suffix = re.search(
                r'^.*\.(\d+)$', bookmark['log_file']).groups()[0]
            print("PATH B")
            self.assertGreater(int(updated_log_file_suffix),
                               int(expected_log_file_suffix))

        # Execute delete across tables using join prior to 3rd sync
        deleted_id = 4

        with db_utils.get_db_connection(
                self.get_properties(), self.get_credentials()).cursor() as cur:

            delete_time = datetime.datetime.now()
            # DELETE T1, T2
            # FROM T1
            # INNER JOIN T2 ON T1.key = T2.key
            # WHERE condition;
            db = self.database_name()
            db_t1 = db + "." + t1
            db_t2 = db + "." + t2
            t1_key = db_t1 + ".id"
            t2_key = db_t2 + ".id"
            statement = "DELETE {}, {} ".format(db_t1, db_t2) + \
                "FROM {} ".format(t1) + \
                "INNER JOIN {} ON {} = {} ".format(db_t2, t1_key, t2_key) + \
                "WHERE {} = {}".format(t1_key, deleted_id)
            cur.execute(statement)

        print(
            "\n\nMySQL DB Actions." + \
            "\nNAME: {}\nTABLE: {}".format(self.database_name(), self.table_name_2()) + \
            "\nTABLE: {}".format(self.table_name_2()) + \
            "\nEVENTS:  {} records deleted\n\n".format(1)
        )

        # run binlog sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # check that version from state is unchanged
        state = menagerie.get_state(conn_id)
        bookmark = state['bookmarks'][self.tap_stream_id(t1)]

        self.assertEqual(expected_table_version, bookmark['version'])

        target_records = runner.get_records_from_target_output()
        records_stream_1 = target_records[self.table_name_1()]
        upsert_records_1 = [
            m['data'] for m in records_stream_1['messages']
            if m['action'] == 'upsert'
        ]
        records_stream_2 = target_records[self.table_name_2()]
        upsert_records_2 = [
            m['data'] for m in records_stream_2['messages']
            if m['action'] == 'upsert'
        ]

        # make sure the record is in the target for both tables with a delete time
        deleted_at_t1 = upsert_records_1[0].get('_sdc_deleted_at')
        self.assertIsNotNone(deleted_at_t1)
        deleted_at_t1_timestamp = utils.strptime_to_utc(
            deleted_at_t1).timestamp()

        deleted_at_t2 = upsert_records_2[0].get('_sdc_deleted_at')
        self.assertIsNotNone(deleted_at_t2)
        deleted_at_t2_timestamp = utils.strptime_to_utc(
            deleted_at_t2).timestamp()

        # the delete times should be equal since it was a single transaction
        self.assertEqual(deleted_at_t1_timestamp, deleted_at_t2_timestamp)
        time_delta = delete_time.timestamp() - deleted_at_t1_timestamp
        print("Delete time vs record: difference in seconds", time_delta)
        # the recorded deletion time should be within 3 seconds of the DELETE statement
        self.assertLess(abs(time_delta), 3)
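The PATH A / PATH B branch above can be expressed as a single predicate. The sketch below assumes MySQL's default binlog naming, where the file name ends in a numeric counter (e.g. mysql-bin.000007); the helper name binlog_position_advanced is hypothetical and not part of the test framework.

import re

def binlog_position_advanced(old_log_file, old_log_pos, new_log_file, new_log_pos):
    """Sketch: True if a binlog bookmark moved forward, either within the same
    file (larger byte offset) or by rotating to a file with a larger suffix."""
    if old_log_file == new_log_file:
        return new_log_pos > old_log_pos

    # different file: compare the numeric suffixes of the binlog file names
    old_suffix = int(re.search(r'\.(\d+)$', old_log_file).group(1))
    new_suffix = int(re.search(r'\.(\d+)$', new_log_file).group(1))
    return new_suffix > old_suffix

With this helper, the bookmark check above would reduce to self.assertTrue(binlog_position_advanced(expected_log_file, expected_log_pos, bookmark['log_file'], bookmark['log_pos'])).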