Python examine_target_output_for_fields示例，tap_tester.runner.examine_target_output_for_fields Python示例

示例#1

0

显示文件

    def test_run(self):
        """
        Check that every expected stream replicates more than one page of
        records while only the automatic fields are selected, and that the
        fields reaching the target are exactly the expected automatic fields.
        """

        conn_id = connections.ensure_connection(self)
        self.run_and_verify_check_mode(conn_id)

        # Select the streams but none of their optional fields.
        self.select_and_verify_fields(conn_id, select_all_fields=False)

        synced_counts = self.run_and_verify_sync(conn_id)

        replicated_fields = runner.examine_target_output_for_fields()

        # Every expected stream must have synced more than a full page of records.
        for stream in self.expected_sync_streams():
            with self.subTest(stream=stream):
                synced = synced_counts.get(stream, 0)
                page_size = int(self.get_properties()['page_size'])
                failure_note = "{} did not sync more than a page of records".format(
                    stream)
                self.assertGreater(synced, page_size, msg=failure_note)

        # Every stream seen in the target output must carry exactly its automatic fields.
        for stream_name in replicated_fields:
            with self.subTest(stream=stream_name):
                expected_fields = self.expected_automatic_fields()[stream_name]
                self.assertSetEqual(expected_fields,
                                    replicated_fields[stream_name])

示例#2

0

显示文件

    def test_run(self):
        """
        Verify that for each stream you can get multiple pages of data
        when no fields are selected and only the automatic fields are replicated.

        Parent streams are expected to send exactly their automatic fields.
        Report streams are expected to additionally send `_sdc_report_datetime`
        and a single performance measure.

        PREREQUISITE
        For EACH stream add enough data that you surpass the limit of a single
        fetch of data.  For instance if you have a limit of 250 records ensure
        that 251 (or more) records have been posted for that stream.
        """
        self.start_date = '2020-11-10T00:00:00Z'
        conn_id = self.create_connection(original_properties=False)

        # Select all parent streams and no fields within streams
        # Select all (testable) report streams and only fields which are automatic and/or required by bing to generate a report
        found_catalogs = menagerie.get_catalogs(conn_id)
        test_catalogs = [catalog for catalog in found_catalogs
                         if catalog.get('tap_stream_id') in self.expected_sync_streams()]

        # BUG_SRCE-4313 (https://stitchdata.atlassian.net/browse/SRCE-4313) streams missing automatic fields
        specific_fields = {**self.report_automatic_fields(), **self.parent_automatic_fields()} # COMMENT to reproduce
        # specific_fields = {**self.report_measure_fields(), **self.parent_automatic_fields()} #  UNCOMMENT to reproduce
        # specific_fields = self.report_measure_fields()  # TODO Use this line once bugs addressed.

        self.perform_and_verify_adjusted_selection(
            conn_id, test_catalogs, select_all_fields=False, specific_fields=specific_fields
        )

        # COMMENT EVERYTHING DOWN FROM HERE TO ADDRESS BUG_SRCE-4313

        # Run a sync job using orchestrator
        state = menagerie.get_state(conn_id)
        record_count_by_stream = self.run_and_verify_sync(conn_id, state)

        actual_fields_by_stream = runner.examine_target_output_for_fields()

        for stream in self.expected_sync_streams():
            with self.subTest(stream=stream):

                if stream == 'goals_and_funnels_report':  # SKIP TESTING FOR THIS STREAM
                    continue  # There is no data available, since we would need to implement a tracking script on singer's site

                # verify that you get some records for each stream
                self.assertGreater(
                    record_count_by_stream.get(stream, -1), 0,
                    msg="The number of records is not over the stream max limit")

                # verify that only the automatic fields are sent to the target for parent streams, and that
                # automatic fields, _sdc_report_datetime, AND specific measure fields are sent to target for report streams
                actual = actual_fields_by_stream.get(stream) or set()
                # Copy before extending: update() works in place, and the set
                # handed back by expected_automatic_fields() may be shared with
                # other callers, so mutating it could leak into later checks.
                expected = set(self.expected_automatic_fields().get(stream, set()))
                if stream.endswith('_report'):  # update expectations for report streams
                    expected_measure = 'Assists' if stream.startswith('goals') else 'Clicks'
                    expected.update({
                        '_sdc_report_datetime',  # tap applies sdc value as pk for all reports
                        expected_measure  # reports require a perf measure (which is intentionally not automatic)
                    })

                self.assertSetEqual(expected, actual)

示例#3

0

显示文件

    def automatic_test(self, conn_id, testable_streams):
        """
        Sync the testable incremental streams with no optional fields selected
        and check, per stream, that more than one page of records arrives and
        that only the primary-key and replication-key fields reach the target.

        PREREQUISITE
        Every stream needs more records than a single fetch returns, e.g. at
        least 251 records when the page limit is 250.
        """
        # Keep only the incremental streams that the caller marked as testable.
        incremental_streams = set()
        for name, method in self.expected_replication_method().items():
            if method == self.INCREMENTAL and name in testable_streams:
                incremental_streams.add(name)

        # Select the streams but none of their fields. If a stream has no
        # automatic fields, this selection must be changed to pick one.
        our_catalogs = []
        for catalog in menagerie.get_catalogs(conn_id):
            if catalog.get('tap_stream_id') in incremental_streams:
                our_catalogs.append(catalog)
        self.select_all_streams_and_fields(
            conn_id, our_catalogs, select_all_fields=False)

        # Run a sync job using orchestrator
        synced_counts = self.run_sync(conn_id)

        replicated_fields = runner.examine_target_output_for_fields()

        for stream in incremental_streams:
            with self.subTest(stream=stream):

                # More than one page must be replicated. Skip this check for
                # streams that cannot exceed one page in the test account.
                stream_metadata = self.expected_metadata().get(stream, {})
                fallback_limit = self.get_properties().get(
                    'result_per_page', self.DEFAULT_RESULTS_PER_PAGE)
                page_limit = stream_metadata.get(self.API_LIMIT,
                                                 fallback_limit)
                self.assertGreater(
                    synced_counts.get(stream, -1),
                    page_limit,
                    msg="The number of records is not over the stream max limit")

                # Only the automatic fields may reach the target.
                sent_fields = replicated_fields.get(stream, set())
                primary_keys = self.expected_primary_keys().get(stream, set())
                replication_keys = self.expected_replication_keys().get(
                    stream, set())
                self.assertEqual(
                    sent_fields,
                    primary_keys | replication_keys,
                    msg="The fields sent to the target are not the automatic fields")

示例#4

0

显示文件

    def do_test(self, conn_id):
        """
        Verify that for each stream you can get multiple pages of data
        and that when all fields are selected more than the automatic fields are replicated.

        The automatic fields of a stream are taken to be its primary keys, its
        top-level replication key fields and its foreign keys; the same set is
        used for both field assertions below.

        PREREQUISITE
        For EACH stream add enough data that you surpass the limit of a single
        fetch of data.  For instance if you have a limit of 250 records ensure
        that 251 (or more) records have been posted for that stream.
        """
        # Select all streams and all fields within streams
        found_catalogs = menagerie.get_catalogs(conn_id)
        self.select_all_streams_and_fields(conn_id,
                                           found_catalogs,
                                           select_all_fields=True)

        # Run a sync job using orchestrator
        record_count_by_stream = self.run_sync(conn_id)

        actual_fields_by_stream = runner.examine_target_output_for_fields()

        for stream in self.expected_streams():
            with self.subTest(stream=stream):

                # verify that we can paginate with all fields selected
                self.assertGreater(
                    record_count_by_stream.get(stream, -1),
                    self.expected_metadata().get(stream,
                                                 {}).get(self.API_LIMIT, 0),
                    msg="The number of records is not over the stream max limit"
                )

                actual_fields = actual_fields_by_stream.get(stream, set())
                automatic_fields = (
                    self.expected_primary_keys().get(stream, set())
                    | self.top_level_replication_key_fields().get(
                        stream, set())
                    | self.expected_foreign_keys().get(stream, set()))

                # verify that the automatic fields are sent to the target
                self.assertTrue(
                    actual_fields.issuperset(automatic_fields),
                    msg=
                    "The fields sent to the target don't include all automatic fields"
                )

                # verify we have more fields sent to the target than just automatic fields
                # SKIP THIS ASSERTION IF ALL FIELDS ARE INTENTIONALLY AUTOMATIC FOR THIS STREAM
                # difference() (not symmetric_difference()) is used so that an
                # expected key which is absent from the target output cannot
                # make this check pass on its own.
                self.assertTrue(
                    actual_fields.difference(automatic_fields),
                    msg=
                    "The fields sent to the target don't include non-automatic fields"
                )

示例#5

0

显示文件

文件： test_automatic_fields.py 项目： BazaarvoiceBizTech/tap-jira

    def do_test(self, conn_id):
        """
        Create test data, sync every stream with no optional fields selected,
        and check per stream that more than one page of records arrives and
        that exactly the automatic fields (primary keys, top-level replication
        key fields and foreign keys) reach the target.

        PREREQUISITE
        Every stream needs more records than a single fetch returns, e.g. at
        least 251 records when the page limit is 250.
        """
        self.create_test_data()

        # Select the streams but none of their fields. If a stream has no
        # automatic fields, this selection must be changed to pick one.
        catalogs = menagerie.get_catalogs(conn_id)
        self.select_all_streams_and_fields(
            conn_id, catalogs, select_all_fields=False)

        # Run a sync job using orchestrator
        synced_counts = self.run_sync(conn_id)

        fields_by_stream = runner.examine_target_output_for_fields()

        mismatch_template = (
            "The fields sent to the target are not the automatic fields."
            "\nExpected: {}\nActual: {}")

        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                # More than one page must be replicated. Skip this check for
                # streams that cannot exceed one page in the test account.
                synced = synced_counts.get(stream, -1)
                page_limit = self.expected_metadata().get(stream, {}).get(
                    self.API_LIMIT)
                self.assertGreater(
                    synced,
                    page_limit,
                    msg="The number of records is not over the stream max limit")

                # Only the automatic fields may reach the target.
                primary_keys = self.expected_primary_keys().get(stream, set())
                replication_fields = self.top_level_replication_key_fields().get(
                    stream, set())
                foreign_keys = self.expected_foreign_keys().get(stream, set())
                expected_fields_for_stream = (
                    primary_keys | replication_fields | foreign_keys)

                sent_fields = fields_by_stream.get(stream, set())
                self.assertEqual(
                    sent_fields,
                    expected_fields_for_stream,
                    msg=mismatch_template.format(expected_fields_for_stream,
                                                 sent_fields))
示例#6

0

显示文件

    def test_run(self):
        """
        Select every field of the testable streams, run a sync, and verify per
        stream that the fields found in the replicated records, together with
        the fields listed in KNOWN_MISSING_FIELDS, equal the fields declared in
        the schema that was sent to the target.
        """
        conn_id = connections.ensure_connection(self)

        found_catalogs = self.run_and_verify_check_mode(conn_id)

        # Select only the expected streams tables
        expected_streams = self.testable_streams()
        catalog_entries = [
            ce for ce in found_catalogs
            if ce['tap_stream_id'] in expected_streams
        ]

        for catalog_entry in catalog_entries:
            stream_schema = menagerie.get_annotated_schema(
                conn_id, catalog_entry['stream_id'])
            connections.select_catalog_and_fields_via_metadata(
                conn_id, catalog_entry, stream_schema)

        # Run sync
        self.run_and_verify_sync(conn_id)

        synced_records = runner.get_records_from_target_output()
        # Read the target output once; it does not change between streams.
        actual_fields_by_stream = runner.examine_target_output_for_fields()

        # Test by Stream
        for stream in expected_streams:
            with self.subTest(stream=stream):
                known_missing_fields = KNOWN_MISSING_FIELDS[stream]

                expected_fields = set(
                    synced_records.get(stream)['schema']['properties'].keys())
                print('Number of expected keys ', len(expected_fields))
                actual_fields = set(actual_fields_by_stream[stream])
                print('Number of actual keys ', len(actual_fields))
                print('Number of known missing keys ',
                      len(known_missing_fields))

                # Fields listed as known-missing that nevertheless showed up
                # in the replicated records.
                unexpected_fields = actual_fields & known_missing_fields
                if unexpected_fields:
                    print('WARNING: Found new fields: {}'.format(
                        unexpected_fields))
                self.assertSetEqual(
                    expected_fields,
                    actual_fields | known_missing_fields)

示例#7

0

显示文件

    def test_run(self):
        """
        Verify that for each stream you can get multiple pages of data
        and that when all fields are selected more than the automatic fields are replicated.

        Only incremental streams that are not child streams and not in the
        hard-coded untested list below are exercised. Before syncing, records
        are created for each tested stream (except "events") until its count
        exceeds self.API_LIMIT.

        PREREQUISITE
        For EACH stream add enough data that you surpass the limit of a single
        fetch of data.  For instance if you have a limit of 250 records ensure
        that 251 (or more) records have been posted for that stream.
        """
        conn_id = connections.ensure_connection(self)

        incremental_streams = {key for key, value in self.expected_replication_method().items()
                               if value == self.INCREMENTAL}
        # Streams excluded from this test; the commented-out names are the
        # streams that remain under test.
        untested_streams = self.child_streams().union({
            'balance_transactions',
            # 'charges',
            # 'coupons',
            # 'customers',
            'disputes',
            # 'invoice_items',
            'invoice_line_items',
            # 'invoices',
            'payout_transactions',
            # 'payouts',
            # 'plans',
            # 'products',
            'subscription_items',
            # 'subscriptions',
            'transfers',
        })
        tested_streams = incremental_streams.difference(untested_streams)
        
        # Select all streams and all fields within streams
        # NOTE(review): found_catalogs is not read again in the visible code;
        # the call is presumably kept for its check-mode verification.
        found_catalogs = self.run_and_verify_check_mode(conn_id)
        our_catalogs = get_catalogs(conn_id, tested_streams)
        self.select_all_streams_and_fields(conn_id, our_catalogs, select_all_fields=True)

        # Ensure tested streams have a record count which exceeds the API LIMIT
        logging.info("Checking record counts for tested streams...")
        # Maps stream name -> number of records currently found for it.
        streams_to_create = {}
        for stream in tested_streams:
            records = list_all_object(stream)
            record_count = len(records)

            streams_to_create[stream] = record_count
            logging.info("   Stream {} has {} records created today".format(stream, record_count))

        logging.info("Creating records for tested streams...")
        # Maps stream name -> objects created by this test run.
        new_objects = {stream: [] for stream in streams_to_create}
        for stream in streams_to_create:
            if stream != "events" and streams_to_create[stream] <= self.API_LIMIT:
                # Keep creating until the count is strictly above API_LIMIT.
                while streams_to_create[stream] <= self.API_LIMIT:
                    logging.info("Creating a record for {} | {} records created today ".format(stream,
                                                                                        streams_to_create[stream]))
                    new_objects[stream].append(create_object(stream))
                    streams_to_create[stream] += 1
                records = list_all_object(stream)
                # NOTE(review): the hard-coded 100 presumably mirrors the page
                # size returned by list_all_object -- confirm against API_LIMIT.
                self.assertEqual(100, len(records))
                logging.info("   Stream {} has at least {} records created today".format(stream, len(records) + 1))

        # Run a sync job using orchestrator
        record_count_by_stream = self.run_and_verify_sync(conn_id)

        actual_fields_by_stream = runner.examine_target_output_for_fields()

        for stream in incremental_streams.difference(untested_streams):
            with self.subTest(stream=stream):

                # verify that we can paginate with all fields selected
                self.assertGreater(
                    record_count_by_stream.get(stream, -1),
                    self.expected_metadata().get(stream, {}).get(self.API_LIMIT, 0),
                    msg="The number of records is not over the stream max limit")

                # verify that the automatic fields are sent to the target
                actual = actual_fields_by_stream.get(stream) or set()
                expected = self.expected_automatic_fields().get(stream, set())
                self.assertTrue(actual.issuperset(expected),
                                msg="The fields sent to the target don't include all automatic fields. "
                                "Expected: {}, Actual: {}". format(expected, actual)
                )

                # verify we have more fields sent to the target than just automatic fields
                # SKIP THIS ASSERTION IF ALL FIELDS ARE INTENTIONALLY AUTOMATIC FOR THIS STREAM
                actual = actual_fields_by_stream.get(stream) or set()
                expected = self.expected_automatic_fields().get(stream, set())
                self.assertTrue(actual.symmetric_difference(expected),
                                msg="The fields sent to the target don't include any non-automatic fields"
                )

                # NOTE(review): the values below are computed but never
                # asserted in the visible code; this example appears to be
                # truncated. Also, new_objects[stream] is empty for streams
                # that already exceeded API_LIMIT, so the [0] lookup would
                # raise IndexError for them -- confirm against the full file.
                if stream != "events":
                    actual = actual_fields_by_stream.get(stream, set())
                    expected = set(new_objects[stream][0].keys())

示例#8

0

显示文件

文件： test_pagination.py 项目： tomslutsky/tap-shopify

    def test_run(self):
        """
        Select every field of the tested incremental streams, run a sync, and
        check per stream that more than one page of records is replicated,
        that all automatic fields reach the target, and that the replicated
        fields are not limited to the automatic ones.

        PREREQUISITE
        Every stream needs more records than a single fetch returns, e.g. at
        least 251 records when the page limit is 250.
        """
        conn_id = self.create_connection()

        # Select all streams and all fields within streams
        found_catalogs = menagerie.get_catalogs(conn_id)

        incremental_streams = set()
        for name, method in self.expected_replication_method().items():
            if method == self.INCREMENTAL:
                incremental_streams.add(name)

        # Child streams and the streams named here are left out of the test.
        skipped_by_name = {
            'abandoned_checkouts', 'collects', 'metafields', 'transactions',
            'order_refunds'
        }
        untested_streams = self.child_streams().union(skipped_by_name)

        selectable_streams = incremental_streams.difference(untested_streams)
        our_catalogs = []
        for catalog in found_catalogs:
            if catalog.get('tap_stream_id') in selectable_streams:
                our_catalogs.append(catalog)

        self.select_all_streams_and_fields(
            conn_id, our_catalogs, select_all_fields=True)

        # Run a sync job using orchestrator
        synced_counts = self.run_sync(conn_id)
        fields_by_stream = runner.examine_target_output_for_fields()

        for stream in self.expected_streams().difference(untested_streams):
            with self.subTest(stream=stream):

                # Pagination must work with every field selected.
                synced = synced_counts.get(stream, -1)
                page_limit = self.expected_metadata().get(stream, {}).get(
                    self.API_LIMIT, 0)
                self.assertGreater(
                    synced,
                    page_limit,
                    msg="The number of records is not over the stream max limit")

                # Every automatic field must reach the target.
                sent_fields = fields_by_stream.get(stream, set())
                automatic_fields = (
                    self.expected_primary_keys().get(stream, set())
                    | self.expected_replication_keys().get(stream, set())
                    | self.expected_foreign_keys().get(stream, set()))
                self.assertTrue(
                    sent_fields.issuperset(automatic_fields),
                    msg="The fields sent to the target don't include all automatic fields")

                # The replicated fields must differ from the automatic set.
                # Skip this check for streams whose fields are all automatic
                # on purpose.
                sent_fields = fields_by_stream.get(stream, set())
                automatic_fields = (
                    self.expected_primary_keys().get(stream, set())
                    | self.expected_replication_keys().get(stream, set())
                    | self.expected_foreign_keys().get(stream, set()))
                self.assertTrue(
                    sent_fields.symmetric_difference(automatic_fields),
                    msg="The fields sent to the target don't include non-automatic fields")

示例#9

0

显示文件

    def test_run(self):
        """
        Verify that a bookmark doesn't exist for the stream
        Verify that the second sync includes the same number or more records than the first sync
        Verify that all records in the first sync are included in the second sync
        Verify that the sync only sent records to the target for selected streams (catalogs)
        Verify that more than just the automatic fields are sent to the target

        PREREQUISITE
        For EACH stream that is fully replicated there are multiple rows of data with
            different values for the replication key
        """
        # Local import keeps this method self-contained with respect to the
        # file's import block.
        from collections import Counter

        print("running test {}".format(self.name()))

        conn_id = self.create_connection()

        # Select all full-table streams and all fields within those streams
        found_catalogs = menagerie.get_catalogs(conn_id)
        full_streams = {
            key
            for key, value in self.expected_replication_method().items()
            if value == self.FULL
        }
        our_catalogs = [
            catalog for catalog in found_catalogs
            if catalog.get('tap_stream_id') in full_streams
        ]
        self.select_all_streams_and_fields(conn_id,
                                           our_catalogs,
                                           select_all_fields=True)

        # Run a sync job using orchestrator
        first_sync_record_count = self.run_sync(conn_id)

        # verify that the sync only sent records to the target for selected streams (catalogs)
        self.assertEqual(set(first_sync_record_count.keys()), full_streams)

        first_sync_state = menagerie.get_state(conn_id)

        # Get the set of records from a first sync
        first_sync_records = runner.get_records_from_target_output()

        # Get the fields for each stream from the first sync
        actual_fields_by_stream = runner.examine_target_output_for_fields()

        # Run a second sync job using orchestrator
        second_sync_record_count = self.run_sync(conn_id)

        # Get the set of records from a second sync
        second_sync_records = runner.get_records_from_target_output()

        # THIS MAKES AN ASSUMPTION THAT CHILD STREAMS DO NOT NEED TESTING.
        # ADJUST IF NECESSARY
        for stream in full_streams.difference(self.child_streams()):
            with self.subTest(stream=stream):

                # verify there is no bookmark values from state
                state_value = first_sync_state.get("bookmarks", {}).get(stream)
                self.assertIsNone(state_value)

                # verify that there is more than 1 record of data - setup necessary
                self.assertGreater(
                    first_sync_record_count.get(stream, 0),
                    1,
                    msg="Data isn't set up to be able to test full sync")

                # verify that you get the same or more data the 2nd time around
                self.assertGreaterEqual(
                    second_sync_record_count.get(stream, 0),
                    first_sync_record_count.get(stream, 0),
                    msg=
                    "second syc didn't have more records, full sync not verified"
                )

                # verify all data from 1st sync included in 2nd sync
                # The fallback for a missing "messages" entry is an empty list:
                # a dict fallback would be iterated by key and make
                # record["data"] fail on a string.
                first_data = [
                    record["data"] for record in first_sync_records.get(
                        stream, {}).get("messages", [])
                ]
                second_data = [
                    record["data"] for record in second_sync_records.get(
                        stream, {}).get("messages", [])
                ]

                # Compare records as multisets of their canonical JSON form.
                # Each first-sync record may be matched by at most one
                # second-sync record, which is what the multiset intersection
                # counts.
                first_serialized = Counter(
                    json.dumps(record, sort_keys=True)
                    for record in first_data)
                second_serialized = Counter(
                    json.dumps(record, sort_keys=True)
                    for record in second_data)
                same_records = sum(
                    (first_serialized & second_serialized).values())

                self.assertEqual(
                    len(first_data),
                    same_records,
                    msg=
                    "Not all data from the first sync was in the second sync")

                # verify we have more fields sent to the target than just automatic fields
                # SKIP THIS ASSERTION IF ALL FIELDS ARE INTENTIONALLY AUTOMATIC FOR THIS STREAM
                # difference() (not symmetric_difference()) is used so that a
                # missing automatic field cannot make this check pass.
                automatic_fields = (
                    self.expected_primary_keys().get(stream, set())
                    | self.expected_replication_keys().get(stream, set())
                    | self.expected_foreign_keys().get(stream, set()))
                self.assertTrue(
                    actual_fields_by_stream.get(
                        stream, set()).difference(automatic_fields),
                    msg=
                    "The fields sent to the target don't include non-automatic fields"
                )

示例#10

0

显示文件

    def test_run(self):
        """
        Create test data, select every field of every stream, run a sync, and
        check per stream that more than one page of records is replicated,
        that the automatic fields reach the target, that additional fields
        reach it too, and that no two upserted records share a primary key.

        PREREQUISITE
        Every stream needs more records than a single fetch returns, e.g. at
        least 251 records when the page limit is 250.
        """

        conn_id = self.create_connection_with_initial_discovery()

        self.create_test_data()

        # Select all streams and all fields within streams
        catalogs = menagerie.get_catalogs(conn_id)
        self.select_all_streams_and_fields(
            conn_id, catalogs, select_all_fields=True)

        # Run a sync job using orchestrator
        synced_counts = self.run_sync(conn_id)

        fields_by_stream = runner.examine_target_output_for_fields()

        synced_recs = runner.get_records_from_target_output()

        for stream in self.expected_streams():
            with self.subTest(stream=stream):

                # gather expectations
                expected_pks = self.expected_primary_keys()[stream]

                # gather results
                record_count = synced_counts.get(stream, -1)
                api_limit = self.expected_metadata().get(stream, {}).get(
                    self.API_LIMIT)
                replicated_fields = fields_by_stream.get(stream, set())

                # One tuple of primary-key values per upserted record.
                pk_value_list = []
                for message in synced_recs[stream].get("messages", []):
                    if message["action"] != "upsert":
                        continue
                    pk_value_list.append(
                        tuple(message.get("data").get(pk)
                              for pk in expected_pks))
                unique_pk_values = set(pk_value_list)

                # Pagination must work with every field selected.
                self.assertGreater(
                    record_count,
                    api_limit,
                    logging="verify the number of records replicated exceeds the stream api limit")

                # Every automatic field must reach the target.
                automatic_fields = (
                    self.expected_primary_keys().get(stream, set())
                    | self.top_level_replication_key_fields().get(stream, set())
                    | self.expected_foreign_keys().get(stream, set()))
                self.assertTrue(
                    replicated_fields.issuperset(automatic_fields),
                    logging="verify the automatic fields are sent to the target")

                # Fields beyond the key fields must reach the target as well.
                # Skip this check for streams whose fields are all automatic
                # on purpose.
                key_fields = (
                    self.expected_primary_keys().get(stream, set())
                    | self.expected_replication_keys().get(stream, set()))
                self.assertTrue(
                    replicated_fields.difference(key_fields),
                    logging="verify more than just the automatic fields are sent to the target")

                # No two records may share the same primary-key values.
                self.assertEqual(
                    len(pk_value_list),
                    len(unique_pk_values),
                    logging="verify records have unique primary key values")

示例#11

0

显示文件

    def test_run(self):
        """
        Check pagination and primary-key uniqueness with only automatic fields.

        - Verify that for each stream you can get multiple pages of data
          when no fields are selected and only the automatic fields are replicated.
        - Verify that all replicated records have unique primary key values

        PREREQUISITE
        For EACH stream add enough data that you surpass the limit of a single
        fetch of data.  For instance if you have a limit of 250 records ensure
        that 251 (or more) records have been posted for that stream.
        """
        conn_id = self.create_connection_with_initial_discovery()

        self.create_test_data()

        # Select all streams and no fields within streams
        # IF THERE ARE NO AUTOMATIC FIELDS FOR A STREAM
        # WE WILL NEED TO UPDATE THE BELOW TO SELECT ONE
        found_catalogs = menagerie.get_catalogs(conn_id)

        expected_streams = self.expected_streams()
        our_catalogs = [
            catalog for catalog in found_catalogs
            if catalog.get('tap_stream_id') in expected_streams
        ]

        self.select_all_streams_and_fields(conn_id,
                                           our_catalogs,
                                           select_all_fields=False)

        # Run a sync job using orchestrator
        record_count_by_stream = self.run_sync(conn_id)

        actual_fields_by_stream = runner.examine_target_output_for_fields()
        synced_records = runner.get_records_from_target_output()

        for stream in expected_streams:
            with self.subTest(stream=stream):

                # gather expectations
                expected_primary_keys = self.expected_primary_keys().get(
                    stream, set())
                expected_automatic_fields = (
                    expected_primary_keys
                    | self.top_level_replication_key_fields().get(stream, set())
                    | self.expected_foreign_keys().get(stream, set()))
                api_limit = self.expected_metadata().get(stream, {}).get(
                    self.API_LIMIT)

                # collect results
                # Fall back to an empty message list when the stream produced no
                # output, so the assertions below report the failure instead of
                # an AttributeError being raised on None here.
                messages = synced_records.get(stream, {'messages': []})
                record_count = record_count_by_stream.get(stream, -1)
                fields_replicated = actual_fields_by_stream.get(stream, set())
                records_pks_list = [
                    tuple(
                        message.get('data').get(primary_key)
                        for primary_key in expected_primary_keys)
                    for message in messages.get('messages', [])
                ]

                # NOTE(review): the `logging=` keyword is not part of the stock
                # unittest assert signatures; presumably the test base class
                # overrides these asserts to accept it - confirm.

                # verify that you get more than a page of data
                self.assertGreater(
                    record_count,
                    api_limit,
                    logging="verify multiple pages are replicated")

                # verify that only the automatic fields are sent to the target
                self.assertEqual(
                    fields_replicated,
                    expected_automatic_fields,
                    logging="verify only automatic fields are replicated")

                # Verify that all replicated records have unique primary key values
                self.assertCountEqual(
                    set(records_pks_list),
                    records_pks_list,
                    msg="We have duplicate records for {}".format(stream),
                    logging="verify all records have unique primary key values"
                )

示例#12

0

显示文件

    def run_test(self, streams):
        """
        Exercise pagination for the given streams with every field selected.

        - Each stream must replicate more records than `self.page_size`.
        - The automatic fields (primary and replication keys) and at least
          one other field must be sent to the target.
        - No primary-key value may show up more than once in the records.
        """
        conn_id = connections.ensure_connection(self)

        # Run check mode, then select every field of only the streams under test
        catalogs = self.run_and_verify_check_mode(conn_id)
        self.select_found_catalogs(conn_id, catalogs, only_streams=streams)

        # Run a sync job and collect what was written to the target
        counts_by_stream = self.run_and_verify_sync(conn_id, streams)
        fields_by_stream = runner.examine_target_output_for_fields()
        records_by_stream = runner.get_records_from_target_output()

        for stream in streams:
            with self.subTest(stream=stream):

                # more than one page of records must have been replicated
                page_limit = self.page_size
                self.assertGreater(
                    counts_by_stream.get(stream, -1),
                    page_limit,
                    msg="The number of records is not over the stream max limit"
                )

                # expectations and results for this stream
                pk_fields = self.expected_primary_keys().get(stream, set())
                empty_output = {'messages': []}
                stream_messages = records_by_stream.get(
                    stream, empty_output).get('messages')
                rk_fields = self.expected_replication_keys().get(stream, set())
                automatic_fields = pk_fields | rk_fields
                fields_sent = fields_by_stream.get(stream, set())

                # every automatic field must be present in the target output
                includes_automatic = fields_sent.issuperset(automatic_fields)
                self.assertTrue(
                    includes_automatic,
                    msg=
                    "The fields sent to the target don't include all automatic fields"
                )

                # something beyond the automatic fields must be present as well
                beyond_automatic = fields_sent.symmetric_difference(
                    automatic_fields)
                self.assertTrue(
                    beyond_automatic,
                    msg=
                    "The fields sent to the target don't include non-automatic fields"
                )

                # no record may be repeated across pages: one PK tuple per record
                pk_tuples = []
                for message in stream_messages:
                    record = message.get('data')
                    pk_tuples.append(
                        tuple(record.get(field) for field in pk_fields))

                self.assertCountEqual(
                    pk_tuples,
                    set(pk_tuples),
                    msg=f"We have duplicate records for {stream}")

示例#13

0

显示文件

    def test_run(self):
        """
        Verify that, when no fields are selected, each tested stream replicates
        records and only the automatic fields are sent to the target.

        Note that the record-count check below only requires at least one
        record per stream, not a full page.

        PREREQUISITE
        For EACH stream add enough data that you surpass the limit of a single
        fetch of data.  For instance if you have a limit of 250 records ensure
        that 251 (or more) records have been posted for that stream.
        """
        conn_id = connections.ensure_connection(self)
        streams_to_create = {
            # "balance_transactions",  # should be created implicitly with a create in the payouts or charges streams
            "charges",
            "coupons",
            "customers",
            "invoice_items",
            "invoice_line_items",  # this is created implicitly by invoices, it just creates another invoice
            "invoices",  # this will create an invoice_item
            "payouts",
            "plans",
            "products",
            "subscription_items",
            "subscriptions",  # this will create a new plan and payment method
        }
        untested_streams = {"disputes", "transfers", "payout_transactions"}

        # Create one object per stream; the created objects themselves are not
        # used by the assertions below, so they are not kept.
        for stream in streams_to_create:
            create_object(stream)

        # Select all streams and no fields within streams
        # IF THERE ARE NO AUTOMATIC FIELDS FOR A STREAM
        # WE WILL NEED TO UPDATE THE BELOW TO SELECT ONE
        found_catalogs = self.run_and_verify_check_mode(conn_id)
        self.select_all_streams_and_fields(conn_id,
                                           found_catalogs,
                                           select_all_fields=False)

        # Run a sync job using orchestrator
        record_count_by_stream = self.run_and_verify_sync(conn_id)

        actual_fields_by_stream = runner.examine_target_output_for_fields()

        for stream in self.expected_streams().difference(untested_streams):
            with self.subTest(stream=stream):

                # verify that you get some records for each stream
                # SKIP THIS ASSERTION FOR STREAMS WHERE YOU CANNOT GET
                # MORE THAN 1 PAGE OF DATA IN THE TEST ACCOUNT
                self.assertGreater(
                    record_count_by_stream.get(stream, -1),
                    0,
                    msg="The number of records is not over the stream max limit"
                )

                # verify that only the automatic fields are sent to the target
                # (`or set()` also covers a stream that is mapped to None)
                actual = actual_fields_by_stream.get(stream) or set()
                expected = self.expected_automatic_fields().get(stream, set())
                self.assertEqual(
                    actual,
                    expected,
                    # arguments follow the order of the placeholders:
                    # expected first, actual second
                    msg=
                    ("The fields sent to the target are not the automatic fields. Expected: {}, Actual: {}"
                     .format(expected, actual)))

示例#14

0

显示文件

    def pagination_test(self, conn_id, testable_streams):
        """
        Verify that for each stream you can get multiple pages of data
        and that when all fields are selected more than the automatic fields are replicated.

        Also verifies that no primary-key value is repeated among the
        replicated records of a stream.

        PREREQUISITE
        For EACH stream add enough data that you surpass the limit of a single
        fetch of data.  For instance if you have a limit of 250 records ensure
        that 251 (or more) records have been posted for that stream.

        :param conn_id: connection id handed to menagerie and the sync runner
        :param testable_streams: iterable of stream names to select and verify
        """

        # Select the testable streams and all fields within them
        found_catalogs = menagerie.get_catalogs(conn_id)
        our_catalogs = [
            catalog for catalog in found_catalogs
            if catalog.get('tap_stream_id') in testable_streams
        ]

        self.select_all_streams_and_fields(conn_id,
                                           our_catalogs,
                                           select_all_fields=True)
        # Run a sync job using orchestrator
        record_count_by_stream = self.run_sync(conn_id)
        actual_fields_by_stream = runner.examine_target_output_for_fields()
        sync_records = runner.get_records_from_target_output()

        api_limit = int(self.get_properties().get(
            'results_per_page', self.DEFAULT_RESULTS_PER_PAGE))

        for stream in testable_streams:
            with self.subTest(stream=stream):

                # verify that we can paginate with all fields selected
                # ('transactions' is held to a fixed threshold of 100 records
                # instead of the configured page size)
                minimum_record_count = 100 if stream == 'transactions' else api_limit
                self.assertGreater(
                    record_count_by_stream.get(stream, -1),
                    minimum_record_count,
                    msg="The number of records is not over the stream max limit"
                )

                # gather expectations and results once per stream
                expected_primary_keys = self.expected_primary_keys().get(
                    stream, set())
                expected_automatic_fields = (
                    expected_primary_keys
                    | self.expected_replication_keys().get(stream, set())
                    | self.expected_foreign_keys().get(stream, set()))
                fields_replicated = actual_fields_by_stream.get(stream, set())
                sync_messages = sync_records.get(stream, {
                    'messages': []
                }).get('messages')

                # verify that the automatic fields are sent to the target
                self.assertTrue(
                    fields_replicated.issuperset(expected_automatic_fields),
                    msg=
                    "The fields sent to the target don't include all automatic fields"
                )

                # verify we have more fields sent to the target than just automatic fields
                # SKIP THIS ASSERTION IF ALL FIELDS ARE INTENTIONALLY AUTOMATIC FOR THIS STREAM
                self.assertTrue(
                    fields_replicated.symmetric_difference(
                        expected_automatic_fields),
                    msg=
                    "The fields sent to the target don't include non-automatic fields"
                )

                # Verify we did not duplicate any records across pages:
                # the PK tuples must be the same with and without de-duplication
                records_pks_list = [
                    tuple(
                        message.get('data').get(primary_key)
                        for primary_key in expected_primary_keys)
                    for message in sync_messages
                ]
                self.assertCountEqual(
                    set(records_pks_list),
                    records_pks_list,
                    msg=f"We have duplicate records for {stream}")

示例#15

0

显示文件

文件： test_harvest_pagination.py 项目： tomusher/tap-harvest

    def do_test(self, conn_id):
        """
        Run a sync and check pagination and field coverage for each tested stream.

        For every expected stream whose `self._master` entry has a truthy
        'test' flag, verify that:
        - more records are replicated than the stream's API limit,
        - none of the stream's expected fields is missing from the target,
        - for non-child streams, all automatic fields reach the target.

        PREREQUISITE
        For EACH stream add enough data that you surpass the limit of a single
        fetch of data.  For instance if you have a limit of 250 records ensure
        that 251 (or more) records have been posted for that stream.
        """

        # Catalogs are fetched, but the explicit selection step is disabled
        found_catalogs = menagerie.get_catalogs(conn_id)
        # self.select_all_streams_and_fields(conn_id, found_catalogs, select_all_fields=True)

        # Run a sync job using orchestrator
        record_count_by_stream = self.run_sync(conn_id)

        fields_by_stream = runner.examine_target_output_for_fields()

        # streams whose master entry opts them out of testing
        skipped_streams = {
            name for name in self._master if not self._master[name]['test']
        }
        streams_under_test = self.expected_streams().difference(
            skipped_streams)

        for stream in streams_under_test:
            with self.subTest(stream=stream):
                logging.info("Testing " + stream)

                # pagination: record count must exceed the stream's API limit
                synced_count = record_count_by_stream.get(stream, -1)
                stream_limit = self.expected_metadata().get(stream, {}).get(
                    self.API_LIMIT, 0)
                self.assertGreater(
                    synced_count,
                    stream_limit,
                    msg="The number of records is not over the stream max limit"
                )

                # TODO - change following assertion to assertEqual and capture all fields
                # Note - This ^ is nontrivial for fields which span multiple streams
                #  ex. {event_type: send} in estimate_messages = {sent_at: time} in estimates

                # the target must receive every expected field of the stream;
                # only missing fields are detected here, extra ones are not
                stream_spec = self._master[stream]
                missing_fields = stream_spec["expected_fields"].difference(
                    fields_by_stream.get(stream, set()))
                self.assertEqual(
                    set(),
                    missing_fields,
                    msg=
                    "The fields sent to the target have an extra or missing field"
                )

                # automatic fields are only checked for non-child streams
                if stream_spec["child"]:
                    continue

                fields_sent = fields_by_stream.get(stream, set())
                automatic_fields = (
                    self.expected_primary_keys().get(stream, set())
                    | self.expected_replication_keys().get(stream, set())
                    | self.expected_foreign_keys().get(stream, set()))
                self.assertTrue(
                    fields_sent.issuperset(automatic_fields),
                    msg=
                    "The fields sent to the target don't include all automatic fields"
                )

示例#16

0

显示文件

    def test_run(self):
        """
        Sync every stream as FULL_TABLE with no fields selected and check
        that only the automatic fields are replicated.

        For each expected stream, the record count must exceed the stream's
        API limit and the fields sent to the target must equal the union of
        its primary, replication and foreign keys.

        PREREQUISITE
        For EACH stream add enough data that you surpass the limit of a single
        fetch of data.  For instance if you have a limit of 250 records ensure
        that 251 (or more) records have been posted for that stream.
        """

        print("running test {}".format(self.name()))

        conn_id = self.create_connection()

        # Select all streams and no fields within streams
        # IF THERE ARE NO AUTOMATIC FIELDS FOR A STREAM
        # WE WILL NEED TO UPDATE THE BELOW TO SELECT ONE
        catalogs = menagerie.get_catalogs(conn_id)

        # stream-level metadata (empty breadcrumb) requesting FULL_TABLE replication
        full_table_metadata = [{
            "breadcrumb": [],
            "metadata": {
                'replication-method': 'FULL_TABLE'
            }
        }]
        # columns passed to the selection helper as non-selected properties
        deselected_columns = [
            "MySmallIntColumn",
            "MyBigIntColumn",
            "MyTinyIntColumn",
            "my_boolean",
            "MyIntColumn",
        ]
        self.select_all_streams_and_fields(
            conn_id,
            catalogs,
            select_all_fields=False,
            additional_md=full_table_metadata,
            non_selected_properties=deselected_columns)

        # Set an empty state, then run a sync job using orchestrator
        menagerie.set_state(conn_id, {})
        counts_by_stream = self.run_sync(conn_id)

        fields_by_stream = runner.examine_target_output_for_fields()

        for stream in self.expected_streams():
            with self.subTest(stream=stream):

                # verify that you get more than a page of data TODO this isn't really testing this...
                # SKIP THIS ASSERTION FOR STREAMS WHERE YOU CANNOT GET
                # MORE THAN 1 PAGE OF DATA IN THE TEST ACCOUNT
                synced_count = counts_by_stream.get(stream, -1)
                stream_limit = self.expected_metadata().get(stream, {}).get(
                    self.API_LIMIT, 0)
                self.assertGreater(
                    synced_count,
                    stream_limit,
                    msg="The number of records is not over the stream max limit"
                )

                # only the automatic fields may have been sent to the target
                fields_sent = fields_by_stream.get(stream, set())
                automatic_fields = (
                    self.expected_primary_keys_by_stream_id().get(
                        stream, set())
                    | self.expected_replication_keys().get(stream, set())
                    | self.expected_foreign_keys().get(stream, set()))
                self.assertEqual(
                    fields_sent,
                    automatic_fields,
                    msg=
                    "The fields sent to the target are not the automatic fields"
                )