Example #1
    def run_test(self):
        conn_id = connections.ensure_connection(self)

        # run in check mode (discovery)
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # verify the tap discovered the right streams
        catalog = menagerie.get_catalogs(conn_id)
        found_catalog_names = set(map(lambda c: c['tap_stream_id'], catalog))

        # assert we find the correct streams
        self.assertEqual(self.expected_check_streams(), found_catalog_names)

        for tap_stream_id in self.expected_check_streams():
            found_stream = [
                c for c in catalog if c['tap_stream_id'] == tap_stream_id
            ][0]
            schema_and_metadata = menagerie.get_annotated_schema(
                conn_id, found_stream['stream_id'])
            main_metadata = schema_and_metadata["metadata"]
            stream_metadata = [
                mdata for mdata in main_metadata if mdata["breadcrumb"] == []
            ]

            # assert that the pks are correct
            self.assertEqual(
                self.expected_pks()[tap_stream_id],
                set(stream_metadata[0]['metadata']['table-key-properties']))

        # select each stream along with all of its fields (no exclusions)
        for stream_catalog in catalog:
            annotated_schema = menagerie.get_annotated_schema(
                conn_id, stream_catalog['stream_id'])
            connections.select_catalog_and_fields_via_metadata(
                conn_id, stream_catalog, annotated_schema['annotated-schema'],
                [])

        # Run sync
        sync_job_name = runner.run_sync_mode(self, conn_id)

        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # capture the records that were persisted to the target
        messages_by_stream = runner.get_records_from_target_output()

        # assert that each of the streams that we synced are the ones that we expect to see
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_first_sync_streams(),
            self.expected_pks())

        # Verify that the full table was synced
        for tap_stream_id in self.expected_first_sync_streams():
            self.assertEqual(
                self.expected_first_sync_row_counts()[tap_stream_id],
                record_count_by_stream[tap_stream_id])
Example #2
    def select_all_streams_and_fields(self,
                                      conn_id,
                                      catalogs,
                                      select_all_fields: bool = True,
                                      select_default_fields: bool = False):
        """Select all streams and all fields within streams"""

        for catalog in catalogs:
            schema = menagerie.get_annotated_schema(conn_id,
                                                    catalog['stream_id'])

            non_selected_properties = []
            if not select_all_fields:
                # get a list of all properties so that none are selected
                non_selected_properties = set(
                    schema.get('annotated-schema', {}).get('properties',
                                                           {}).keys())

                if select_default_fields and self.is_custom_report(
                        catalog['stream_name']):
                    non_selected_properties = non_selected_properties.difference(
                        self.custom_report_minimum_valid_field_selection(
                            catalog['stream_name']))

            connections.select_catalog_and_fields_via_metadata(
                conn_id, catalog, schema, [], non_selected_properties)
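A typical call site for the helper above might look like the following sketch; the flag combination is an assumption, not taken from the source:

        found_catalogs = menagerie.get_catalogs(conn_id)
        # select every stream, but keep only the minimum valid fields for custom reports
        self.select_all_streams_and_fields(conn_id, found_catalogs,
                                           select_all_fields=False,
                                           select_default_fields=True)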
Example #3
    def _select_streams_and_fields(self, conn_id, catalogs,
                                   select_default_fields,
                                   select_pagination_fields):
        """Select all streams and all fields within streams"""

        for catalog in catalogs:

            schema_and_metadata = menagerie.get_annotated_schema(
                conn_id, catalog['stream_id'])
            metadata = schema_and_metadata['metadata']

            properties = set(md['breadcrumb'][-1] for md in metadata
                             if len(md['breadcrumb']) > 0
                             and md['breadcrumb'][0] == 'properties')

            # deselect everything except the expected default or pagination fields
            if select_default_fields:
                non_selected_properties = properties.difference(
                    self.expected_default_fields()[catalog['stream_name']])
            elif select_pagination_fields:
                non_selected_properties = properties.difference(
                    self.expected_pagination_fields()[catalog['stream_name']])
            else:
                non_selected_properties = properties

            connections.select_catalog_and_fields_via_metadata(
                conn_id, catalog, schema_and_metadata, [],
                non_selected_properties)
Example #4
    def select_all_streams_and_fields(conn_id,
                                      catalogs,
                                      select_all_fields: bool = True):
        """Select all streams and all fields within streams"""
        for catalog in catalogs:
            schema = menagerie.get_annotated_schema(conn_id,
                                                    catalog['stream_id'])

            non_selected_properties = []
            if not select_all_fields:
                # get a list of all properties so that none are selected
                non_selected_properties = set(
                    schema.get('annotated-schema', {}).get('properties',
                                                           {}).keys())

            # HACK: This can be removed if the tap unwraps envelope
            # objects and declares replication keys as automatic
            if catalog["tap_stream_id"] == 'issues' and 'fields' in non_selected_properties:
                # 'fields' contains the replication key for issues
                non_selected_properties.remove("fields")
            elif catalog["tap_stream_id"] == "worklogs" and 'updated' in non_selected_properties:
                # 'updated' is the replication key for worklogs
                non_selected_properties.remove("updated")

            connections.select_catalog_and_fields_via_metadata(
                conn_id, catalog, schema, [], non_selected_properties)
Example #5
    def select_found_catalogs(self,
                              conn_id,
                              catalogs,
                              only_streams=None,
                              deselect_all_fields: bool = False,
                              non_selected_props=[]):
        """Select all streams and all fields within streams"""
        for catalog in catalogs:
            if only_streams and catalog["stream_name"] not in only_streams:
                continue
            schema = menagerie.get_annotated_schema(conn_id,
                                                    catalog["stream_id"])

            if deselect_all_fields:
                # get a list of all properties so that none are selected
                non_selected_properties = schema.get("annotated-schema",
                                                     {}).get("properties", {}).keys()
            else:
                non_selected_properties = non_selected_props
            additional_md = []

            connections.select_catalog_and_fields_via_metadata(
                conn_id,
                catalog,
                schema,
                additional_md=additional_md,
                non_selected_fields=non_selected_properties)
Example #6
    def select_all_streams_and_fields(self,
                                      conn_id,
                                      catalogs,
                                      select_all_fields: bool = True,
                                      exclude_streams=None):
        """Select all streams and all fields within streams"""

        for catalog in catalogs:
            if exclude_streams and catalog.get(
                    'stream_name') in exclude_streams:
                continue
            schema = menagerie.get_annotated_schema(conn_id,
                                                    catalog['stream_id'])
            non_selected_properties = []
            if not select_all_fields:
                # get a list of all properties so that none are selected
                non_selected_properties = schema.get('annotated-schema',
                                                     {}).get('properties', {})
                # remove properties that are automatic
                for prop in self.expected_automatic_fields().get(
                        catalog['stream_name'], []):
                    if prop in non_selected_properties:
                        del non_selected_properties[prop]
                non_selected_properties = non_selected_properties.keys()
            additional_md = []

            connections.select_catalog_and_fields_via_metadata(
                conn_id,
                catalog,
                schema,
                additional_md=additional_md,
                non_selected_fields=non_selected_properties)
Example #7
    def test_organizations_dynamic_fields(self):
        """
        Run tap in check mode and verify more than one page is retruned for dynamic fields. 
        """
        conn_id = connections.ensure_connection(self)

        # run and verify the tap in discover mode
        found_catalog = self.run_and_verify_check_mode(conn_id)

        # Verify number of dynamic fields in organizations stream metadata
        # (Need enough dynamic fields for organizations)
        for catalog in found_catalog:
            if catalog['stream_name'] == "organizations":
                organization_fields_page_limit = 100

                schema_and_metadata = menagerie.get_annotated_schema(
                    conn_id, catalog['stream_id'])
                schema_fields = schema_and_metadata.get(
                    'annotated-schema').get('properties').keys()
                organizations_dynamic_fields = [
                    field for field in schema_fields
                    if field not in self.organizations_static_fields()
                ]

                # Verify that the count of dynamic fields exceeds the page limit for organization fields (pagination)
                self.assertGreater(len(organizations_dynamic_fields),
                                   organization_fields_page_limit)
Example #8
    def do_test(self, conn_id):
        # Select our catalogs
        our_catalogs = [c for c in self.found_catalogs if c.get('tap_stream_id') in self.expected_sync_streams()]
        for c in our_catalogs:
            c_annotated = menagerie.get_annotated_schema(conn_id, c['stream_id'])
            connections.select_catalog_and_fields_via_metadata(conn_id, c, c_annotated, [], [])

        # Clear state before our run
        menagerie.set_state(conn_id, {})

        # Run a sync job using orchestrator
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # Verify actual rows were synced
        record_count_by_stream = runner.examine_target_output_file(self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = reduce(lambda accum, c: accum + c, record_count_by_stream.values())
        self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        # Ensure all records have a value for PK(s)
        records = runner.get_records_from_target_output()
        for stream in self.expected_sync_streams():
            messages = records.get(stream,{}).get('messages',[])
            if stream in ['tickets', 'groups', 'users']:
                self.assertGreater(len(messages), 100, msg="Stream {} has fewer than 100 records synced".format(stream))
            for m in messages:
                pk_set = self.expected_pks()[stream]
                for pk in pk_set:
                    self.assertIsNotNone(m.get('data', {}).get(pk), msg="Missing primary-key for message {}".format(m))
Example #9
    def set_replication_methods(self, conn_id, catalogs, replication_methods):

        replication_keys = self.expected_replication_keys()

        for catalog in catalogs:

            replication_method = replication_methods.get(
                catalog['stream_name'])

            if replication_method == self.INCREMENTAL:
                replication_key = list(
                    replication_keys.get(catalog['stream_name']))[0]
                replication_md = [{
                    "breadcrumb": [],
                    "metadata": {
                        'replication-key': replication_key,
                        "replication-method": replication_method,
                        "selected": True
                    }
                }]
            else:
                replication_md = [{
                    "breadcrumb": [],
                    "metadata": {
                        'replication-key': None,
                        "replication-method": "FULL_TABLE",
                        "selected": True
                    }
                }]

            connections.set_non_discoverable_metadata(
                conn_id, catalog,
                menagerie.get_annotated_schema(conn_id, catalog['stream_id']),
                replication_md)
Example #10
def select_all_streams_and_fields(conn_id, catalogs):
    """Select all streams and all fields within streams"""
    for catalog in catalogs:
        schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])

        connections.select_catalog_and_fields_via_metadata(
            conn_id, catalog, schema)
Example #11
    def test_primary_keys(self):
        """
        Verify that the configuration can be used to set primary key fields when
            * the primary key is an empty list
            * the primary key is a single field
            * the primary key is a composite of multiple fields
        """
        found_catalogs = menagerie.get_catalogs(S3TypesAndData.conn_id)
        all_catalogs = [x for x in found_catalogs]
        for catalog in all_catalogs:
            with self.subTest(c=catalog):
                expected_key_properties = \
                    S3TypesAndData.expected_pks()[catalog["stream_name"]]
                metadata_and_annotated_schema = menagerie.get_annotated_schema(
                    S3TypesAndData.conn_id, catalog['stream_id'])

                # verify that expected_key_properties show as automatic in metadata
                metadata = metadata_and_annotated_schema["metadata"]
                actual_key_properties = {
                    item["breadcrumb"][1]
                    for item in metadata
                    if len(item.get("breadcrumb", [])) > 1
                    and item.get("metadata", {}).get("inclusion") == "automatic"
                }
                self.assertEqual(actual_key_properties,
                                 expected_key_properties)
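The breadcrumb indexing used here (and throughout these examples) assumes discovered metadata shaped roughly like the illustrative list below; the field names and values are placeholders:

        # stream-level entry: breadcrumb == []
        # field-level entries: breadcrumb == ['properties', '<field name>']
        example_metadata = [
            {'breadcrumb': [],
             'metadata': {'table-key-properties': ['id'], 'selected': True}},
            {'breadcrumb': ['properties', 'id'],
             'metadata': {'inclusion': 'automatic'}},
            {'breadcrumb': ['properties', 'name'],
             'metadata': {'inclusion': 'available', 'selected': True}},
        ]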
Example #12
    def run_test(self, only_automatic_fields=False):
        expected_streams = self.streams_to_select()
        conn_id = connections.ensure_connection(self)
        runner.run_check_mode(self, conn_id)

        expected_stream_fields = dict()

        found_catalogs = menagerie.get_catalogs(conn_id)
        for catalog in found_catalogs:
            stream_name = catalog['stream_name']
            catalog_entry = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])
            if stream_name not in expected_streams:
                continue
            # select catalog fields
            self.select_found_catalogs(conn_id,
                                       [catalog],
                                       only_streams=[stream_name],
                                       deselect_all_fields=only_automatic_fields,
                                       non_selected_props=[] if only_automatic_fields else self.non_selected_fields[stream_name])
            # add expected fields for assertion
            fields_from_field_level_md = [md_entry['breadcrumb'][1] for md_entry in catalog_entry['metadata']
                                          if md_entry['breadcrumb'] != []]
            if only_automatic_fields:
                expected_stream_fields[stream_name] = self.expected_primary_keys()[stream_name] | self.expected_replication_keys()[stream_name]
            else:
                expected_stream_fields[stream_name] = set(fields_from_field_level_md) - set(self.non_selected_fields[stream_name])

        self.run_and_verify_sync(conn_id)
        synced_records = runner.get_records_from_target_output()

        for stream in expected_streams:
            with self.subTest(stream=stream):

                expected_primary_keys = self.expected_primary_keys()[stream]

                # get expected keys
                expected_keys = expected_stream_fields[stream]

                # collect all actual values
                messages = synced_records.get(stream)

                # collect actual synced fields
                actual_keys = [set(message['data'].keys()) for message in messages['messages']
                                   if message['action'] == 'upsert'][0]

                fields = self.fields_to_remove.get(stream) or []
                expected_keys = expected_keys - set(fields)

                # verify expected and actual fields
                self.assertEqual(expected_keys, actual_keys,
                                 msg='Selected keys in catalog are not as expected')

                # Verify we did not duplicate any records across pages
                records_pks_set = {tuple([message.get('data').get(primary_key) for primary_key in expected_primary_keys])
                                   for message in messages.get('messages')}
                records_pks_list = [tuple([message.get('data').get(primary_key) for primary_key in expected_primary_keys])
                                    for message in messages.get('messages')]
                self.assertCountEqual(records_pks_set, records_pks_list,
                                      msg="We have duplicate records for {}".format(stream))
Example #13
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))

        diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
        self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff))
        print("discovered schemas are OK")

        # select all catalogs

        for c in found_catalogs:
            catalog_entry = menagerie.get_annotated_schema(conn_id, c['stream_id'])
            if c['stream_name'] in self.expected_sync_streams().keys():
                stream = c['stream_name']
                pks = self.expected_sync_streams()[stream]

                for pk in pks:
                    mdata = next((m for m in catalog_entry['metadata']
                                  if len(m['breadcrumb']) == 2 and m['breadcrumb'][1] == pk), None)
                    print("Validating inclusion on {}: {}".format(c['stream_name'], mdata))
                    self.assertTrue(mdata and mdata['metadata']['inclusion'] == 'automatic')

                connections.select_catalog_and_fields_via_metadata(conn_id, c, catalog_entry)

        # clear state
        menagerie.set_state(conn_id, {})

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        first_record_count_by_stream = runner.examine_target_output_file(self, conn_id, set(self.expected_sync_streams().keys()), self.expected_sync_streams())
        replicated_row_count = reduce(lambda accum, c: accum + c, first_record_count_by_stream.values())
        self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(first_record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        # Verify that automatic fields are all emitted with records
        synced_records = runner.get_records_from_target_output()
        for stream_name, data in synced_records.items():
            record_messages = [set(row['data'].keys()) for row in data['messages']]
            self.assertGreater(len(record_messages), 0, msg="stream {} did not sync any records.".format(stream_name))
            for record_keys in record_messages:
                self.assertEqual(self.expected_sync_streams().get(stream_name, set()) - record_keys, set())
Example #14
    def perform_and_verify_table_and_field_selection(
            self,  # TODO clean this up and select_all_streams_and_fields
            conn_id,
            test_catalogs,
            select_all_fields=True,
            select_default_fields=False):
        """
        Perform table and field selection based off of the streams to select
        set and field selection parameters.

        Verify this results in the expected streams selected and all or no
        fields selected for those streams.
        """

        # Select all available fields or select no fields from all testable streams
        self.select_all_streams_and_fields(
            conn_id=conn_id,
            catalogs=test_catalogs,
            select_all_fields=select_all_fields,
            select_default_fields=select_default_fields)

        catalogs = menagerie.get_catalogs(conn_id)

        # Ensure our selection affects the catalog
        expected_selected = [tc.get('stream_name') for tc in test_catalogs]
        for cat in catalogs:
            catalog_entry = menagerie.get_annotated_schema(
                conn_id, cat['stream_id'])

            # Verify all testable streams are selected
            selected = catalog_entry.get('annotated-schema').get('selected')
            print("Validating selection on {}: {}".format(
                cat['stream_name'], selected))
            if cat['stream_name'] not in expected_selected:
                self.assertFalse(selected,
                                 msg="Stream selected, but not testable.")
                continue  # Skip remaining assertions if we aren't selecting this stream
            self.assertTrue(selected, msg="Stream not selected.")

            if select_all_fields:
                # Verify all fields within each selected stream are selected
                for field, field_props in catalog_entry.get(
                        'annotated-schema').get('properties').items():
                    field_selected = field_props.get('selected')
                    print("\tValidating selection on {}.{}: {}".format(
                        cat['stream_name'], field, field_selected))
                    self.assertTrue(field_selected, msg="Field not selected.")
            else:
                if not self.is_custom_report(cat['stream_name']):
                    # Verify only automatic fields are selected
                    expected_automatic_fields = self.expected_automatic_fields(
                    ).get(cat['stream_name'])
                    selected_fields = self.get_selected_fields_from_metadata(
                        catalog_entry['metadata'])
                    self.assertEqual(expected_automatic_fields,
                                     selected_fields)
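Examples #14, #19, and #22 rely on a get_selected_fields_from_metadata helper that is not shown. A plausible sketch, assuming field-level selection is recorded in the metadata as in the examples above:

    @staticmethod
    def get_selected_fields_from_metadata(metadata):
        selected_fields = set()
        for field in metadata:
            # field-level metadata has a breadcrumb like ['properties', '<field>']
            is_field_metadata = len(field['breadcrumb']) > 1
            is_selected = (field['metadata'].get('selected') is True
                           or field['metadata'].get('inclusion') == 'automatic')
            if is_field_metadata and is_selected:
                selected_fields.add(field['breadcrumb'][1])
        return selected_fields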
Example #15
    def test_catalog_without_properties(self):

        self.setUpTestEnvironment()

        runner.run_check_job_and_check_status(self)

        found_catalogs = menagerie.get_catalogs(self.conn_id)
        self.assertEqual(len(found_catalogs), 1,
                         msg="unable to locate schemas for connection {}".format(self.conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        subset = self.expected_streams().issubset(found_catalog_names)
        self.assertTrue(
            subset, msg="Expected check streams are not subset of discovered catalog")

        our_catalogs = [c for c in found_catalogs if c.get(
            'tap_stream_id') in self.expected_streams()]

        # Select our catalogs
        for c in our_catalogs:
            c_annotated = menagerie.get_annotated_schema(
                self.conn_id, c['stream_id'])
            connections.select_catalog_and_fields_via_metadata(
                self.conn_id, c, c_annotated, [], [])

        # Clear state before our run
        menagerie.set_state(self.conn_id, {})

        # Run a sync job using orchestrator
        sync_job_name = runner.run_sync_mode(self, self.conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(self.conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        synced_records = runner.get_records_from_target_output()
        upsert_messages = [m for m in synced_records.get(
            'csv_with_empty_lines').get('messages') if m['action'] == 'upsert']

        records = [message.get('data') for message in upsert_messages]

        # Empty lines should be ignored in emitted records.

        expected_records = [
            {'id': 1, 'name': 'John', '_sdc_extra': [{'name': 'carl'}], '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
                '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv', '_sdc_source_lineno': 2},
            {'id': 2, 'name': 'Bob', '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
                '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv', '_sdc_source_lineno': 3},
            {'id': 3, '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets',
                '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv', '_sdc_source_lineno': 4},
            {'id': 4, 'name': 'Alice', '_sdc_extra': [{'no_headers': ['Ben', '5']}, {
                'name': 'Barak'}], '_sdc_source_bucket': 'com-stitchdata-prod-circleci-assets', '_sdc_source_file': 'tap_tester/test_csv_with_empty_lines.csv', '_sdc_source_lineno': 5}
        ]

        self.assertListEqual(expected_records, records)
Example #16
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        found_catalogs = self.run_and_verify_check_mode(conn_id)

        # select all catalogs
        for catalog in found_catalogs:
            connections.select_catalog_and_fields_via_metadata(conn_id, catalog, menagerie.get_annotated_schema(conn_id, catalog['stream_id']))

        future_time = "2050-01-01T00:00:00.000000Z"

        # set bookmarks far in the future so that no data should be replicated
        future_bookmarks = {"currently_syncing": None,
                            "bookmarks": {"contacts": {"offset": {},
                                                       "versionTimestamp": future_time},
                                          "subscription_changes": {"startTimestamp": future_time,
                                                                   "offset": {}},
                                          "campaigns": {"offset": {}},
                                          "forms": {"updatedAt": future_time},
                                          "deals": {"offset": {},
                                                    "hs_lastmodifieddate": future_time},
                                          "workflows": {"updatedAt": future_time},
                                          "owners": {"updatedAt": future_time},
                                          "contact_lists": {"updatedAt": future_time,
                                                            "offset": {}},
                                          "email_events": {"startTimestamp": future_time,
                                                           "offset": {}},
                                          "companies": {"offset": {},
                                                        "hs_lastmodifieddate": future_time},
                                          "engagements": {"lastUpdated": future_time,
                                                          "offset": {}}}}

        menagerie.set_state(conn_id, future_bookmarks)

        record_count_by_stream = self.run_and_verify_sync(conn_id)

        # because the bookmarks were set in the future, we should NOT actually replicate any data,
        # except campaigns and deal_pipelines, because those endpoints do NOT support bookmarks
        streams_with_bookmarks = self.expected_sync_streams()
        streams_with_bookmarks.remove('campaigns')
        streams_with_bookmarks.remove('deal_pipelines')
        bad_streams = streams_with_bookmarks.intersection(record_count_by_stream.keys())
        self.assertEqual(len(bad_streams), 0, msg="still pulled down records from {} despite future bookmarks".format(bad_streams))


        state = menagerie.get_state(conn_id)

        # NB: Companies and engagements won't set a bookmark in the future.
        state["bookmarks"].pop("companies")
        state["bookmarks"].pop("engagements")
        future_bookmarks["bookmarks"].pop("companies")
        future_bookmarks["bookmarks"].pop("engagements")

        self.assertEqual(state, future_bookmarks, msg="state should not have been modified because we didn't replicate any data")
        bookmarks = state.get('bookmarks')
        bookmark_streams = set(state.get('bookmarks').keys())
Example #17
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))

        diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
        self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff))
        print("discovered schemas are kosher")

        all_excluded_fields = {}
        # select all catalogs
        for c in found_catalogs:
            if c['stream_name'] == 'ads':
                continue

            discovered_schema = menagerie.get_annotated_schema(conn_id, c['stream_id'])['annotated-schema']
            all_excluded_fields[c['stream_name']] = list(set(discovered_schema.keys()) - self.expected_automatic_fields().get(c['stream_name'], set()))[:5]
            connections.select_catalog_and_fields_via_metadata(
                conn_id,
                c,
                discovered_schema,
                non_selected_fields=all_excluded_fields[c['stream_name']])

        # clear state
        menagerie.set_state(conn_id, {})

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # This should be validating that the PKs are written in each record
        record_count_by_stream = runner.examine_target_output_file(self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = reduce(lambda accum, c: accum + c, record_count_by_stream.values())
        self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        synced_records = runner.get_records_from_target_output()
        self.assertTrue('ads' not in synced_records.keys())
        for stream_name, data in synced_records.items():
            record_messages = [set(row['data'].keys()) for row in data['messages']]
            for record_keys in record_messages:
                # The intersection should be empty
                self.assertFalse(record_keys.intersection(all_excluded_fields[stream_name]))
Example #18
    def test_run(self):

        conn_id = connections.ensure_connection(self, payload_hook=None)

        # Run the tap in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # Verify the check's exit status
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # Verify that there are catalogs found
        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        subset = self.expected_check_streams().issubset(found_catalog_names)
        self.assertTrue(
            subset,
            msg="Expected check streams are not subset of discovered catalog")
        # Select some catalogs
        our_catalogs = [
            c for c in found_catalogs
            if c.get('tap_stream_id') in self.expected_sync_streams()
        ]
        for catalog in our_catalogs:
            schema = menagerie.get_annotated_schema(conn_id,
                                                    catalog['stream_id'])
            connections.select_catalog_and_fields_via_metadata(
                conn_id, catalog, schema, [], [])

        # Verify that all streams sync at least one row for the initial sync.
        # This test also verifies access token expiration handling: if the test fails with an
        # authentication error, the refresh token was not replaced after expiring.
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        zero_count_streams = {
            k
            for k, v in record_count_by_stream.items() if v == 0
        }
        self.assertFalse(
            zero_count_streams,
            msg="The following streams did not sync any rows {}".format(
                zero_count_streams))
Example #19
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        found_catalogs = self.run_and_verify_check_mode(conn_id)

        # Select only the expected streams tables
        expected_streams = self.expected_streams()
        catalog_entries = [ce for ce in found_catalogs if ce['tap_stream_id'] in expected_streams]
        self.select_all_streams_and_fields(conn_id, catalog_entries, select_all_fields=False)

        # Verify our selection worked as expected
        catalogs_selection = menagerie.get_catalogs(conn_id)
        for cat in catalogs_selection:
            catalog_entry = menagerie.get_annotated_schema(conn_id, cat['stream_id'])

            # Verify the expected stream tables are selected
            selected = catalog_entry.get('annotated-schema').get('selected')
            print("Validating selection on {}: {}".format(cat['stream_name'], selected))
            if cat['stream_name'] not in expected_streams:
                self.assertFalse(selected, msg="Stream selected, but not testable.")
                continue # Skip remaining assertions if we aren't selecting this stream
            self.assertTrue(selected, msg="Stream not selected.")

            # Verify only automatic fields are selected
            expected_automatic_fields = self.expected_automatic_fields().get(cat['tap_stream_id'])
            selected_fields = self.get_selected_fields_from_metadata(catalog_entry['metadata'])
            self.assertEqual(expected_automatic_fields, selected_fields, msg='for stream {}, expected: {} actual: {}'.format(cat['stream_name'], expected_automatic_fields, selected_fields))

        # Run a sync job using orchestrator
        sync_record_count = self.run_and_verify_sync(conn_id)
        synced_records = runner.get_records_from_target_output()

        # Assert the records for each stream
        for stream in self.expected_streams():
            with self.subTest(stream=stream):
                data = synced_records.get(stream)

                if not data:
                    print('WARNING: Add data for {}'.format(stream))
                    continue

                record_messages_keys = [set(row['data'].keys()) for row in data['messages']]
                expected_keys = self.expected_automatic_fields().get(stream)

                # Verify that only the automatic fields are sent to the target
                for actual_keys in record_messages_keys:
                    self.assertEqual(
                        actual_keys.symmetric_difference(expected_keys), set(),
                        msg="Expected automatic fields and nothing else.")

                # Verify the sync meets or exceeds the default record count
                record_count = sync_record_count.get(stream, 0)
                self.assertLessEqual(1, record_count)
Example #20
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))

        diff = self.expected_check_streams().symmetric_difference(
            found_catalog_names)
        self.assertEqual(
            len(diff),
            0,
            msg="discovered schemas do not match: {}".format(diff))
        print("discovered schemas are kosher")

        # select all catalogs
        for c in found_catalogs:
            catalog_entry = menagerie.get_annotated_schema(
                conn_id, c['stream_id'])
            connections.select_catalog_via_metadata(conn_id, c, catalog_entry)

        # clear state
        menagerie.set_state(conn_id, {})

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # This should be validating that the PKs are written in each record

        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = reduce(lambda accum, c: accum + c,
                                      record_count_by_stream.values())
        self.assertGreater(replicated_row_count,
                           0,
                           msg="failed to replicate any data: {}".format(
                               record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))
Example #21
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # Run the tap in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # Verify the check's exit status
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        # Verify that there are catalogs found
        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(
            len(found_catalogs),
            0,
            msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(
            map(lambda c: c['tap_stream_id'], found_catalogs))
        subset = self.expected_check_streams().issubset(found_catalog_names)
        self.assertTrue(
            subset,
            msg="Expected check streams are not subset of discovered catalog")

        # Select some catalogs
        our_catalogs = [
            c for c in found_catalogs
            if c.get('tap_stream_id') in self.expected_sync_streams()
        ]
        for catalog in our_catalogs:
            schema = menagerie.get_annotated_schema(conn_id,
                                                    catalog['stream_id'])
            connections.select_catalog_and_fields_via_metadata(
                conn_id, catalog, schema)

        # Clear State and run sync
        menagerie.set_state(conn_id, {})
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # Verify rows were synced
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = reduce(lambda accum, c: accum + c,
                                      record_count_by_stream.values())
        self.assertGreater(replicated_row_count,
                           0,
                           msg="failed to replicate any data: {}".format(
                               record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))
Example #22
    def perform_and_verify_table_and_field_selection(self,
                                                     conn_id,
                                                     test_catalogs,
                                                     select_all_fields=True):
        """
        Perform table and field selection based off of the streams to select
        set and field selection parameters.

        Verify this results in the expected streams selected and all or no
        fields selected for those streams.
        """

        # Select all available fields or select no fields from all testable streams
        self.select_all_streams_and_fields(
            conn_id=conn_id, catalogs=test_catalogs, select_all_fields=select_all_fields
        )

        catalogs = menagerie.get_catalogs(conn_id)

        # Ensure our selection affects the catalog
        expected_selected = [tc.get('stream_name') for tc in test_catalogs]
        for cat in catalogs:
            catalog_entry = menagerie.get_annotated_schema(conn_id, cat['stream_id'])

            # Verify all testable streams are selected
            top_level_md = [md_entry for md_entry in catalog_entry['metadata']
                            if md_entry['breadcrumb'] == []]
            selected = top_level_md[0]['metadata'].get('selected')
            print("Validating selection on {}: {}".format(cat['stream_name'], selected))
            if cat['stream_name'] not in expected_selected:
                self.assertFalse(selected, msg="Stream selected, but not testable.")
                continue # Skip remaining assertions if we aren't selecting this stream
            self.assertTrue(selected, msg="Stream not selected.")

            if select_all_fields:
                # Verify all fields within each selected stream are selected
                field_level_md = [md_entry for md_entry in catalog_entry['metadata']
                                  if md_entry['breadcrumb'] != []]
                for field_md in field_level_md:
                    field = field_md['breadcrumb'][1]
                    field_selected = field_md['metadata'].get('selected')
                    print("\tValidating selection on {}.{}: {}".format(
                        cat['stream_name'], field, field_selected))
                    self.assertTrue(field_selected, msg="Field not selected.")
            else:
                # Verify only automatic fields are selected
                expected_automatic_fields = self.expected_automatic_fields().get(cat['stream_name'])
                selected_fields = self.get_selected_fields_from_metadata(catalog_entry['metadata'])
                # BUG TDL-14241 | Replication keys are not automatic
                if cat['stream_name'] == "file_metadata":
                    expected_automatic_fields.remove('modifiedTime')
                self.assertEqual(expected_automatic_fields, selected_fields)
Example #23
    def perform_and_verify_adjusted_selection(self, conn_id, test_catalogs,
                                              select_all_fields,
                                              specific_fields):
        """
        Perform table and field selection based off of the streams to select
        set and field selection parameters.

        Verify this results in the expected streams selected and all or no
        fields selected for those streams.
        """

        # Select specific fields from all testable streams
        self.select_specific_fields(conn_id=conn_id,
                                    catalogs=test_catalogs,
                                    select_all_fields=select_all_fields,
                                    specific_fields=specific_fields)

        catalogs = menagerie.get_catalogs(conn_id)

        # Ensure our selection affects the catalog
        expected_selected = [tc.get('tap_stream_id') for tc in test_catalogs]
        for cat in catalogs:
            with self.subTest(cat=cat):
                catalog_entry = menagerie.get_annotated_schema(
                    conn_id, cat['stream_id'])

                # Verify intended streams are selected
                selected = catalog_entry.get('annotated-schema').get(
                    'selected')
                print("Validating selection on {}: {}".format(
                    cat['tap_stream_id'], selected))
                if cat['tap_stream_id'] not in expected_selected:
                    continue  # Skip remaining assertions if we aren't selecting this stream

                self.assertTrue(selected, msg="Stream not selected.")

                if select_all_fields:
                    # Verify all fields within each selected stream are selected
                    for field, field_props in catalog_entry.get(
                            'annotated-schema').get('properties').items():
                        field_selected = field_props.get('selected')
                        print("\tValidating selection on {}.{}: {}".format(
                            cat['stream_name'], field, field_selected))
                        self.assertTrue(field_selected,
                                        msg="Field not selected.")
                else:
                    for field, field_props in catalog_entry.get(
                            'annotated-schema').get('properties').items():
                        field_selected = field_props.get('selected')
                        if field_selected:
                            print("\tValidating selection on {}.{}: {}".format(
                                cat['stream_name'], field, field_selected))
Example #24
    def test_data_type_sampling(self):
        """
        Verify that each data type can be sampled and determined correctly.

        A file for stream `test_data_types_no_coercion` was set up which
        has one column for each test. Tests include each data type:
            * integer
            * number
            * date-time
            * string

        Integers are tested for boundary conditions of signed and unsigned big-ints.
        Strings are tested for length, including a null string and 65536 chars.
        Numbers are tested for float and double representations at the borders,
            which are exponents for extremely large and small positive and
            negative numbers plus zero. Numbers are also tested for precision.
        Date-times are tested at the borders of allowed Python date-times in
            multiple formats, including just dates, just times, date-times
            with timezone, and date-times without timezone.

        The test below uses subtests so that each data type is tested and
        reported on individually.
        """

        found_catalogs = menagerie.get_catalogs(S3TypesAndData.conn_id)

        # only testing the data types stream for now, may want to test all of them
        # or add more tests for different things for other catalogs.
        data_type_catalogs = [
            x for x in found_catalogs
            if x["stream_name"] in ("test_data_types_no_coercion",
                                    "test_switching_data_types")
        ]

        for data_type in DataTypes:
            for catalog in data_type_catalogs:
                with self.subTest(dt=(data_type, catalog)):

                    # verify each data type is sampled correctly in the annotated-schema
                    expected_properties = S3TypesAndData.expected_properties_for_data_types(
                        data_type, catalog['stream_name'])

                    metadata_and_annotated_schema = menagerie.get_annotated_schema(
                        S3TypesAndData.conn_id, catalog['stream_id'])
                    properties = metadata_and_annotated_schema[
                        "annotated-schema"]["properties"]
                    actual_properties = {
                        k
                        for k, v in properties.items()
                        if v.get(data_type.value[0]) == data_type.value[1]
                    }
                    self.assertEqual(expected_properties, actual_properties)
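The data_type.value[0] / data_type.value[1] indexing above implies a DataTypes enum whose members pair an annotated-schema keyword with its expected sampled value. A hypothetical sketch (the exact members and values are assumptions):

from enum import Enum

class DataTypes(Enum):
    # (schema keyword, expected sampled value) - placeholder pairs
    INTEGER = ('type', ['null', 'integer'])
    NUMBER = ('type', ['null', 'number'])
    DATE_TIME = ('format', 'date-time')
    STRING = ('type', ['null', 'string'])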
Example #25
    def select_all_streams_and_fields(conn_id, catalogs, select_all_fields: bool = True):
        """Select all streams and all fields within streams"""
        for catalog in catalogs:
            schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])

            non_selected_properties = []
            if not select_all_fields:
                # get a list of all properties so that none are selected
                non_selected_properties = schema.get('annotated-schema', {}).get(
                    'properties', {}).keys()

            connections.select_catalog_and_fields_via_metadata(
                conn_id, catalog, schema, [], non_selected_properties)
Example #26
    def test_run(self):
        conn_id = self.create_connection()

        # Select our catalogs
        our_catalogs = [
            c for c in self.found_catalogs
            if c.get('tap_stream_id') in self.expected_sync_streams()
        ]
        for c in our_catalogs:
            c_annotated = menagerie.get_annotated_schema(
                conn_id, c['stream_id'])
            connections.select_catalog_and_fields_via_metadata(
                conn_id, c, c_annotated, [], [])

        # Clear state before our run
        menagerie.set_state(conn_id, {})

        # Run a sync job using orchestrator
        sync_job_name = runner.run_sync_mode(self, conn_id)

        # Verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        # Verify actual rows were synced
        record_count_by_stream = runner.examine_target_output_file(
            self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = reduce(lambda accum, c: accum + c,
                                      record_count_by_stream.values())
        self.assertGreater(replicated_row_count,
                           0,
                           msg="failed to replicate any data: {}".format(
                               record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        # Ensure all records have a value for PK(s)
        records = runner.get_records_from_target_output()
        for stream in self.expected_sync_streams():
            messages = records.get(stream).get('messages')
            for m in messages:
                pk_set = self.expected_pks()[stream]
                for pk in pk_set:
                    self.assertIsNotNone(m.get('data', {}).get(pk),
                                         msg="missing primary key in record {}".format(m))

        bookmarks = menagerie.get_state(conn_id)['bookmarks']

        self.assertTrue('orders' in bookmarks)
Example #27
    def test_run(self):
        conn_id = connections.ensure_connection(self)

        # run in check mode
        check_job_name = runner.run_check_mode(self, conn_id)

        # verify check exit codes
        exit_status = menagerie.get_exit_status(conn_id, check_job_name)
        menagerie.verify_check_exit_status(self, exit_status, check_job_name)

        found_catalogs = menagerie.get_catalogs(conn_id)
        self.assertGreater(len(found_catalogs), 0, msg="unable to locate schemas for connection {}".format(conn_id))

        found_catalog_names = set(map(lambda c: c['tap_stream_id'], found_catalogs))

        diff = self.expected_check_streams().symmetric_difference(found_catalog_names)
        self.assertEqual(len(diff), 0, msg="discovered schemas do not match: {}".format(diff))
        print("discovered schemas are kosher")

        # select all catalogs
        #selected_catalogs = list(map(lambda catalog: self.perform_field_selection(conn_id, catalog), found_catalogs))
        #menagerie.post_annotated_catalogs(conn_id, selected_catalogs)

        for c in found_catalogs:
            connections.select_catalog_and_fields_via_metadata(conn_id, c,
                                                               menagerie.get_annotated_schema(conn_id, c['stream_id']))

        # clear state
        menagerie.set_state(conn_id, {})

        sync_job_name = runner.run_sync_mode(self, conn_id)

        # verify tap and target exit codes
        exit_status = menagerie.get_exit_status(conn_id, sync_job_name)
        menagerie.verify_sync_exit_status(self, exit_status, sync_job_name)

        record_count_by_stream = runner.examine_target_output_file(self, conn_id, self.expected_sync_streams(), self.expected_pks())
        replicated_row_count = reduce(lambda accum, c: accum + c, record_count_by_stream.values())
        self.assertGreater(replicated_row_count, 0, msg="failed to replicate any data: {}".format(record_count_by_stream))
        print("total replicated row count: {}".format(replicated_row_count))

        # bookmarks for the 4 streams should be 2015-03-16
        states = menagerie.get_state(conn_id)["bookmarks"]
        end_date = self.get_properties()["end_date"].split()[0]
        for k, v in states.items():
            if "insights" in k:
                bm_date = v.get("date_start")
                self.assertEqual(end_date, bm_date)
        print("bookmarks match end_date of {}".format(end_date))
Example #28
    def select_specific_fields(conn_id, catalogs, select_all_fields: bool = True, specific_fields: dict = {}):
        """Select all streams and all fields within streams"""
        for catalog in catalogs:
            schema = menagerie.get_annotated_schema(conn_id, catalog['stream_id'])

            non_selected_properties_adjusted = []
            if not select_all_fields:
                # get a list of all properties and remove the specified (measure) fields
                non_selected_properties = set(schema.get('annotated-schema', {}).get(
                    'properties', {}).keys())
                spec_fields = specific_fields.get(catalog['stream_name'], set())
                non_selected_properties_adjusted = non_selected_properties.difference(spec_fields)

            connections.select_catalog_and_fields_via_metadata(
                conn_id, catalog, schema, [], non_selected_properties_adjusted)
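A hedged usage example for the helper above; the stream and field names are placeholders:

        select_specific_fields(conn_id, found_catalogs,
                               select_all_fields=False,
                               specific_fields={'reports': {'id', 'date', 'clicks'}})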
Example #29
    def do_test(self, conn_id):
        # Get the Streams for Organizations and Users
        streams = [c for c in self.found_catalogs if c['stream_name'] in ['organizations', 'users']]

        # Create a list of tuples where the first element is the word minus the last letter, i.e. "organization",
        # and the second element is the annotated schema
        schemas = [(s['stream_name'][:-1], menagerie.get_annotated_schema(conn_id, s['stream_id'])) for s in streams]

        # Loop over them
        for schema in schemas:
            properties = schema[1]['annotated-schema']['properties']
            # Ensure that "organization_fields" or "user_fields" are objects in the annotated schema
            # with their own set of properties
            self.assertIsNotNone(properties.get('{}_fields'.format(schema[0]), {}).get('properties'),
                                 msg='{}_fields not present in schema!'.format(schema[0]))
Example #30
    def select_found_catalogs(self, found_catalogs):
        # selected = [menagerie.select_catalog(self.conn_id, c) for c in found_catalogs]
        # menagerie.post_annotated_catalogs(self.conn_id, selected)
        for catalog in found_catalogs:
            schema = menagerie.get_annotated_schema(self.conn_id,
                                                    catalog['stream_id'])
            non_selected_properties = []
            additional_md = []

            connections.select_catalog_and_fields_via_metadata(
                self.conn_id,
                catalog,
                schema,
                additional_md=additional_md,
                non_selected_fields=non_selected_properties)