    def test_column_name_change(self):
        """Tests correct renaming of Snowflake columns after source change"""
        tap_lines_before_column_name_change = test_utils.get_test_tap_lines('messages-with-three-streams.json')
        tap_lines_after_column_name_change = test_utils.get_test_tap_lines(
            'messages-with-three-streams-modified-column.json')

        # Load with default settings
        self.persist_lines_with_cache(tap_lines_before_column_name_change)
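        # Load the streams again with a modified column to trigger column versioning and renaming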
        self.persist_lines_with_cache(tap_lines_after_column_name_change)

        # Get loaded rows from tables
        snowflake = DbSync(self.config)
        target_schema = self.config.get('default_target_schema', '')
        table_one = snowflake.query("SELECT * FROM {}.test_table_one ORDER BY c_pk".format(target_schema))
        table_two = snowflake.query("SELECT * FROM {}.test_table_two ORDER BY c_pk".format(target_schema))
        table_three = snowflake.query("SELECT * FROM {}.test_table_three ORDER BY c_pk".format(target_schema))

        # Get the previous column name from information schema in test_table_two
        previous_column_name = snowflake.query("""
            SELECT column_name
              FROM information_schema.columns
             WHERE table_catalog = '{}'
               AND table_schema = '{}'
               AND table_name = 'TEST_TABLE_TWO'
               AND ordinal_position = 1
            """.format(
            self.config.get('dbname', '').upper(),
            target_schema.upper()))[0]["COLUMN_NAME"]

        # Table one should have no changes
        self.assertEqual(
            table_one,
            [{'C_INT': 1, 'C_PK': 1, 'C_VARCHAR': '1'}])

        # Table two should have a versioned column
        self.assertEqual(
            table_two,
            [
                {previous_column_name: datetime.datetime(2019, 2, 1, 15, 12, 45), 'C_INT': 1, 'C_PK': 1,
                 'C_VARCHAR': '1', 'C_DATE': None},
                {previous_column_name: datetime.datetime(2019, 2, 10, 2), 'C_INT': 2, 'C_PK': 2, 'C_VARCHAR': '2',
                 'C_DATE': '2019-02-12 02:00:00'},
                {previous_column_name: None, 'C_INT': 3, 'C_PK': 3, 'C_VARCHAR': '2', 'C_DATE': '2019-02-15 02:00:00'}
            ]
        )

        # Table three should have renamed columns
        self.assertEqual(
            table_three,
            [
                {'C_INT': 1, 'C_PK': 1, 'C_TIME': datetime.time(4, 0), 'C_VARCHAR': '1', 'C_TIME_RENAMED': None},
                {'C_INT': 2, 'C_PK': 2, 'C_TIME': datetime.time(7, 15), 'C_VARCHAR': '2', 'C_TIME_RENAMED': None},
                {'C_INT': 3, 'C_PK': 3, 'C_TIME': datetime.time(23, 0, 3), 'C_VARCHAR': '3',
                 'C_TIME_RENAMED': datetime.time(8, 15)},
                {'C_INT': 4, 'C_PK': 4, 'C_TIME': None, 'C_VARCHAR': '4', 'C_TIME_RENAMED': datetime.time(23, 0, 3)}
            ])
    def test_nested_schema_unflattening(self):
        """Loading nested JSON objects with no props without flattening"""
        tap_lines = test_utils.get_test_tap_lines('messages-with-nested-schema.json')

        # Load with default settings - Flattening disabled
        self.persist_lines(tap_lines)

        # Get loaded rows from tables - Transform JSON to string at query time
        bigquery = DbSync(self.config)
        target_schema = self.config.get('default_target_schema', '')
        unflattened_table = query(bigquery, """
            SELECT c_pk
                  , c_array c_array
                  , c_object c_object
                  , c_object c_object_with_props
                  , c_nested_object c_nested_object
              FROM {}.test_table_nested_schema
             ORDER BY c_pk""".format(target_schema))

        # Should be valid nested JSON strings
        self.assertEqual(
            unflattened_table,
            [{
                'c_pk': 1,
                'c_array': '[1, 2, 3]',
                'c_object': '{"key_1": "value_1"}',
                'c_object_with_props': '{"key_1": "value_1"}',
                'c_nested_object': {'nested_prop_1': 'nested_value_1',
                                    'nested_prop_2': 'nested_value_2',
                                    'nested_prop_3': {'multi_nested_prop_1': 'multi_value_1',
                                                      'multi_nested_prop_2': 'multi_value_2'}},
            }])
    def test_flush_streams_with_no_intermediate_flushes(self, mock_emit_state):
        """Test emitting states when no intermediate flush required"""
        mock_emit_state.get.return_value = None
        tap_lines = test_utils.get_test_tap_lines('messages-pg-logical-streams.json')

        # Set batch size big enough so it never has to flush in the middle
        self.config['hard_delete'] = True
        self.config['batch_size_rows'] = 1000
        self.persist_lines(tap_lines)

        # State should be emitted only once with the latest received STATE message
        self.assertEqual(
            mock_emit_state.mock_calls,
            [
                mock.call({"currently_syncing": None, "bookmarks": {
                    "logical1-logical1_edgydata": {"last_replication_method": "LOG_BASED", "lsn": 108240872, "version": 1570922723596, "xmin": None},
                    "logical1-logical1_table1": {"last_replication_method": "LOG_BASED", "lsn": 108240872, "version": 1570922723618, "xmin": None},
                    "logical1-logical1_table2": {"last_replication_method": "LOG_BASED", "lsn": 108240872, "version": 1570922723635, "xmin": None},
                    "logical2-logical2_table1": {"last_replication_method": "LOG_BASED", "lsn": 108240872, "version": 1570922723651, "xmin": None},
                    "public-city": {"last_replication_method": "INCREMENTAL", "replication_key": "id", "version": 1570922723667, "replication_key_value": 4079},
                    "public-country": {"last_replication_method": "FULL_TABLE", "version": 1570922730456, "xmin": None},
                    "public2-wearehere": {}}})
            ])

        # Every table should be loaded correctly
        self.assert_logical_streams_are_in_bigquery(True)
    def test_nested_schema_unflattening(self):
        """Loading nested JSON objects into VARIANT columns without flattening"""
        tap_lines = test_utils.get_test_tap_lines(
            'messages-with-nested-schema.json')

        # Load with default settings - Flattening disabled
        self.persist_lines_with_cache(tap_lines)

        # Get loaded rows from tables - Transform JSON to string at query time
        snowflake = DbSync(self.config)
        target_schema = self.config.get('default_target_schema', '')
        unflattened_table = snowflake.query("""
            SELECT c_pk
                  ,TO_CHAR(c_array) c_array
                  ,TO_CHAR(c_object) c_object
                  ,TO_CHAR(c_object) c_object_with_props
                  ,TO_CHAR(c_nested_object) c_nested_object
              FROM {}.test_table_nested_schema
             ORDER BY c_pk""".format(target_schema))

        # Should be valid nested JSON strings
        self.assertEqual(unflattened_table, [{
            'C_PK': 1,
            'C_ARRAY': '[1,2,3]',
            'C_OBJECT': '{"key_1":"value_1"}',
            'C_OBJECT_WITH_PROPS': '{"key_1":"value_1"}',
            'C_NESTED_OBJECT': '{"nested_prop_1":"nested_value_1","nested_prop_2":"nested_value_2","nested_prop_3":{"multi_nested_prop_1":"multi_value_1","multi_nested_prop_2":"multi_value_2"}}'
        }])
    def test_nested_schema_flattening(self):
        """Loading nested JSON objects with flattening and not not flattening"""
        tap_lines = test_utils.get_test_tap_lines('messages-with-nested-schema.json')

        # Turning on data flattening
        self.config['data_flattening_max_level'] = 10

        # Load with flattening enabled
        self.persist_lines_with_cache(tap_lines)

        # Get loaded rows from tables
        snowflake = DbSync(self.config)
        target_schema = self.config.get('default_target_schema', '')
        flattened_table = snowflake.query(
            "SELECT * FROM {}.test_table_nested_schema ORDER BY c_pk".format(target_schema))

        # Should be flattened columns
        self.assertEqual(
            flattened_table,
            [{
                'C_PK': 1,
                'C_ARRAY': '[\n  1,\n  2,\n  3\n]',
                'C_OBJECT': None,
                # Cannot map RECORD to SCHEMA. SCHEMA doesn't have the properties required for flattening
                'C_OBJECT_WITH_PROPS__KEY_1': 'value_1',
                'C_NESTED_OBJECT__NESTED_PROP_1': 'nested_value_1',
                'C_NESTED_OBJECT__NESTED_PROP_2': 'nested_value_2',
                'C_NESTED_OBJECT__NESTED_PROP_3__MULTI_NESTED_PROP_1': 'multi_value_1',
                'C_NESTED_OBJECT__NESTED_PROP_3__MULTI_NESTED_PROP_2': 'multi_value_2',
            }])
    def test_loading_csv_files(self):
        """Loading multiple tables from the same input tap with various columns types"""
        tap_lines = test_utils.get_test_tap_lines(
            'messages-with-three-streams.json')

        self.persist_messages(tap_lines)
        self.assert_three_streams_are_in_s3_bucket()
    def test_information_schema_cache_outdated(self):
        """If informations schema cache is not up to date then it should fail"""
        tap_lines_with_multi_streams = test_utils.get_test_tap_lines(
            "messages-with-three-streams.json")

        # 1) Simulate an out of date cache:
        # Table is in the cache but does not exist in the database
        snowflake = DbSync(self.config)
        target_schema = self.config.get("default_target_schema", "").upper()
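        # Create the pipelinewise columns cache table and seed it with columns that do not exist in the database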
        snowflake.query("""
            CREATE TABLE IF NOT EXISTS {}.columns (table_schema VARCHAR, table_name VARCHAR, column_name VARCHAR, data_type VARCHAR)
        """.format(snowflake.pipelinewise_schema))
        snowflake.query("""
            INSERT INTO {0}.columns (table_schema, table_name, column_name, data_type)
            SELECT '{1}', 'TEST_TABLE_ONE', 'DUMMY_COLUMN_1', 'TEXT' UNION
            SELECT '{1}', 'TEST_TABLE_ONE', 'DUMMY_COLUMN_2', 'TEXT' UNION
            SELECT '{1}', 'TEST_TABLE_TWO', 'DUMMY_COLUMN_3', 'TEXT'
        """.format(snowflake.pipelinewise_schema, target_schema))

        # Loading with an outdated information_schema cache should fail because the table does not exist
        with self.assertRaises(Exception):
            self.persist_lines_with_cache(tap_lines_with_multi_streams)

        # 2) Simulate an out of date cache:
        # Table is in the cache but its structure is not in sync with the actual table in the database
        snowflake.query("CREATE SCHEMA IF NOT EXISTS {}".format(target_schema))
        snowflake.query(
            "CREATE OR REPLACE TABLE {}.test_table_one (C_PK NUMBER, C_INT NUMBER, C_VARCHAR TEXT)"
            .format(target_schema))

        # Loading with an outdated information_schema cache should fail because the column already exists
        # It should try to add the new column based on the values in the cache, but the column already exists
        with self.assertRaises(Exception):
            self.persist_lines_with_cache(tap_lines_with_multi_streams)
    def test_nested_schema_flattening(self):
        """Loading nested JSON objects with flattening and not not flattening"""
        tap_lines = test_utils.get_test_tap_lines('messages-with-nested-schema.json')

        # Turning on data flattening
        self.config['data_flattening_max_level'] = 10

        # Load with flattening enabled
        self.persist_lines(tap_lines)

        # Get loaded rows from tables
        bigquery = DbSync(self.config)
        target_schema = self.config.get('default_target_schema', '')
        flattened_table = query(bigquery,
            "SELECT * FROM {}.test_table_nested_schema ORDER BY c_pk".format(target_schema))

        # Should be flattened columns
        self.assertEqual(
            flattened_table,
            [{
                'c_pk': 1,
                'c_array': '[1, 2, 3]',
                'c_object': None,
                'c_object_with_props__key_1': 'value_1',
                'c_nested_object__nested_prop_1': 'nested_value_1',
                'c_nested_object__nested_prop_2': 'nested_value_2',
                'c_nested_object__nested_prop_3__multi_nested_prop_1': 'multi_value_1',
                'c_nested_object__nested_prop_3__multi_nested_prop_2': 'multi_value_2',
            }])
    def test_naming_convention(self):
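        """Loading multiple tables from the same input tap with a custom S3 key naming convention"""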
        tap_lines = test_utils.get_test_tap_lines(
            'messages-with-three-streams.json')

        self.config['naming_convention'] = "tester/{stream}/{timestamp}.csv"
        self.persist_messages(tap_lines)
        self.assert_three_streams_are_in_s3_bucket()
    def test_table_with_pk_multi_column_removed(self):
        """Test table with a pk with multiple columns gets clustered by those and removing the pk doesnt cause errors"""
        tap_lines = test_utils.get_test_tap_lines('table_with_multi_pk_cluster.json')
        self.persist_lines(tap_lines)

        # Get loaded rows from tables
        bigquery = DbSync(self.config)
        target_schema = self.config.get('default_target_schema', '')
        table = query(bigquery, "SELECT * FROM {}.test_table_cluster_multi ORDER BY c_pk".format(target_schema))
        cluster_columns = query(bigquery, "SELECT clustering_ordinal_position, column_name FROM {}.INFORMATION_SCHEMA.COLUMNS WHERE table_name = 'test_table_cluster_multi' AND clustering_ordinal_position > 0 ORDER BY 1".format(target_schema))

        # ----------------------------------------------------------------------
        # Check rows in table
        # ----------------------------------------------------------------------
        expected_table = [
            {'c_pk': 2, 'c_int': 2, 'c_varchar': '2', 'c_date': datetime.datetime(2019, 2, 12, 2, 0, 0, tzinfo=timezone.utc)},
            {'c_pk': 3, 'c_int': 3, 'c_varchar': '2', 'c_date': datetime.datetime(2019, 2, 15, 2, 0, 0, tzinfo=timezone.utc)}
        ]

        expected_cluster_columns = [
            {'clustering_ordinal_position': 1, 'column_name': 'c_pk'},
            {'clustering_ordinal_position': 2, 'column_name': 'c_varchar'}
        ]

        self.assertEqual(self.remove_metadata_columns_from_rows(table), expected_table)
        self.assertEqual(cluster_columns, expected_cluster_columns)

        # ----------------------------------------------------------------------
        # Remove the primary key and check if clustering stayed unchanged
        # ----------------------------------------------------------------------
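        # Loading a stream without a primary key requires primary_key_required to be disabled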
        self.config['primary_key_required'] = False
        tap_lines = test_utils.get_test_tap_lines('table_with_multi_pk_cluster_changed.json')
        self.persist_lines(tap_lines)

        table_changed = query(bigquery, "SELECT * FROM {}.test_table_cluster_multi ORDER BY c_pk".format(target_schema))
        cluster_columns_changed = query(bigquery, "SELECT clustering_ordinal_position, column_name FROM {}.INFORMATION_SCHEMA.COLUMNS WHERE table_name = 'test_table_cluster_multi' AND clustering_ordinal_position > 0 ORDER BY 1".format(target_schema))

        expected_table_changed = [
            {'c_pk': 2, 'c_int': 2, 'c_varchar': '2', 'c_date': datetime.datetime(2019, 2, 12, 2, 0, 0, tzinfo=timezone.utc)},
            {'c_pk': 2, 'c_int': 2, 'c_varchar': '2', 'c_date': datetime.datetime(2019, 2, 12, 2, 0, 0, tzinfo=timezone.utc)},
            {'c_pk': 3, 'c_int': 3, 'c_varchar': '2', 'c_date': datetime.datetime(2019, 2, 15, 2, 0, 0, tzinfo=timezone.utc)}
        ]

        self.assertEqual(self.remove_metadata_columns_from_rows(table_changed), expected_table_changed)
        self.assertEqual(cluster_columns_changed, expected_cluster_columns)
    def test_loading_csv_files_with_gzip_compression(self):
        """Loading multiple tables from the same input tap with gzip compression"""
        tap_lines = test_utils.get_test_tap_lines('messages-with-three-streams.json')

        # Turning on gzip compression
        self.config['compression'] = 'gzip'
        self.persist_messages(tap_lines)
        self.assert_three_streams_are_in_s3_bucket(compression='gzip')
    def test_loading_tables_with_client_side_encryption_and_wrong_master_key(self):
        """Loading multiple tables from the same input tap with various columns types"""
        tap_lines = test_utils.get_test_tap_lines('messages-with-three-streams.json')

        # Turn on client-side encryption and load, using a well-formatted but wrong master key
        self.config['client_side_encryption_master_key'] = "Wr0n6m45t3rKeY0123456789a0123456789a0123456="
        with assert_raises(ProgrammingError):
            self.persist_lines_with_cache(tap_lines)
    def test_loading_tables_with_no_encryption(self):
        """Loading multiple tables from the same input tap with various columns types"""
        tap_lines = test_utils.get_test_tap_lines('messages-with-three-streams.json')

        # Turn off client-side encryption and load
        self.config['client_side_encryption_master_key'] = ''
        self.persist_lines(tap_lines)

        self.assert_three_streams_are_into_bigquery()
    def test_logical_streams_from_pg_with_hard_delete_and_default_batch_size_should_pass(self):
        """Tests logical streams from pg with inserts, updates and deletes"""
        tap_lines = test_utils.get_test_tap_lines('messages-pg-logical-streams.json')

        # Turning on hard delete mode
        self.config['hard_delete'] = True
        self.persist_lines(tap_lines)

        self.assert_logical_streams_are_in_bigquery(True)
    def test_loading_tables_with_custom_temp_dir(self):
        """Loading multiple tables from the same input tap using custom temp directory"""
        tap_lines = test_utils.get_test_tap_lines('messages-with-three-streams.json')

        # Use custom temp_dir
        self.config['temp_dir'] = '~/.pipelinewise/tmp'
        self.persist_messages(tap_lines)

        self.assert_three_streams_are_in_s3_bucket()
    def test_loading_unicode_characters(self):
        """Loading unicode encoded characters"""
        tap_lines = test_utils.get_test_tap_lines(
            'messages-with-unicode-characters.json')

        # Load with default settings
        target_snowflake.persist_lines(self.config, tap_lines)

        # Get loaded rows from tables
        snowflake = DbSync(self.config)
        target_schema = self.config.get('schema', '')
        table_unicode = snowflake.query(
            "SELECT * FROM {}.test_table_unicode".format(target_schema))

        self.assertEqual(table_unicode, [
            {'C_INT': 1, 'C_PK': 1, 'C_VARCHAR': 'Hello world, Καλημέρα κόσμε, コンニチハ'},
            {'C_INT': 2, 'C_PK': 2, 'C_VARCHAR': 'Chinese: 和毛泽东 <<重上井冈山>>. 严永欣, 一九八八年.'},
            {'C_INT': 3, 'C_PK': 3, 'C_VARCHAR': 'Russian: Зарегистрируйтесь сейчас на Десятую Международную Конференцию по'},
            {'C_INT': 4, 'C_PK': 4, 'C_VARCHAR': 'Thai: แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช'},
            {'C_INT': 5, 'C_PK': 5, 'C_VARCHAR': 'Arabic: لقد لعبت أنت وأصدقاؤك لمدة وحصلتم علي من إجمالي النقاط'},
            {'C_INT': 6, 'C_PK': 6, 'C_VARCHAR': 'Special Characters: [",\'!@£$%^&*()]'}
        ])
    def test_loading_tables_with_client_side_encryption(self):
        """Loading multiple tables from the same input tap with various columns types"""
        tap_lines = test_utils.get_test_tap_lines('messages-with-three-streams.json')

        # Turn on client-side encryption and load
        self.config['client_side_encryption_master_key'] = os.environ.get('CLIENT_SIDE_ENCRYPTION_MASTER_KEY')
        self.persist_lines(tap_lines)

        self.assert_three_streams_are_into_bigquery()
    def test_column_name_change(self):
        """Tests correct renaming of bigquery columns after source change"""
        tap_lines_before_column_name_change = test_utils.get_test_tap_lines('messages-with-three-streams.json')
        tap_lines_after_column_name_change = test_utils.get_test_tap_lines(
            'messages-with-three-streams-modified-column.json')

        # Load with default settings
        self.persist_lines(tap_lines_before_column_name_change)
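        # Load the streams again with a modified column to trigger column versioning and renaming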
        self.persist_lines(tap_lines_after_column_name_change)

        # Get loaded rows from tables
        bigquery = DbSync(self.config)
        target_schema = self.config.get('default_target_schema', '')
        table_one = query(bigquery, "SELECT * FROM {}.test_table_one ORDER BY c_pk".format(target_schema))
        table_two = query(bigquery, "SELECT * FROM {}.test_table_two ORDER BY c_pk".format(target_schema))
        table_three = query(bigquery, "SELECT * FROM {}.test_table_three ORDER BY c_pk".format(target_schema))

        # Table one should have no changes
        self.assertEqual(
            table_one,
            [{'c_int': 1, 'c_pk': 1, 'c_varchar': '1'}])

        # Table two should have a versioned column
        self.assertEqual(
            table_two,
            [
                {'c_int': 1, 'c_pk': 1,
                 'c_varchar': '1', 'c_date': datetime.datetime(2019, 2, 1, 15, 12, 45, tzinfo=timezone.utc), 'c_date__st': None},
                {'c_int': 2, 'c_pk': 2, 'c_varchar': '2',
                 'c_date': datetime.datetime(2019, 2, 10, 2, tzinfo=timezone.utc), 'c_date__st': '2019-02-12 02:00:00'},
                {'c_int': 3, 'c_pk': 3, 'c_varchar': '2', 'c_date': None, 'c_date__st': '2019-02-15 02:00:00'}
            ]
        )

        # Table three should have renamed columns
        self.assertEqual(
            table_three,
            [
                {'c_int': 1, 'c_pk': 1, 'c_time': datetime.time(4, 0), 'c_varchar': '1', 'c_time_renamed': None},
                {'c_int': 2, 'c_pk': 2, 'c_time': datetime.time(7, 15), 'c_varchar': '2', 'c_time_renamed': None},
                {'c_int': 3, 'c_pk': 3, 'c_time': datetime.time(23, 0, 3), 'c_varchar': '3',
                 'c_time_renamed': datetime.time(8, 15)},
                {'c_int': 4, 'c_pk': 4, 'c_time': None, 'c_varchar': '4', 'c_time_renamed': datetime.time(23, 0, 3)}
            ])
    def test_loading_unicode_characters(self):
        """Loading unicode encoded characters"""
        tap_lines = test_utils.get_test_tap_lines(
            "messages-with-unicode-characters.json")

        # Load with default settings
        self.persist_lines_with_cache(tap_lines)

        # Get loaded rows from tables
        snowflake = DbSync(self.config)
        target_schema = self.config.get("default_target_schema", "")
        table_unicode = snowflake.query(
            "SELECT * FROM {}.test_table_unicode ORDER BY C_INT".format(
                target_schema))

        self.assertEqual(
            table_unicode,
            [
                {
                    "C_INT": 1,
                    "C_PK": 1,
                    "C_VARCHAR": "Hello world, Καλημέρα κόσμε, コンニチハ"
                },
                {
                    "C_INT": 2,
                    "C_PK": 2,
                    "C_VARCHAR": "Chinese: 和毛泽东 <<重上井冈山>>. 严永欣, 一九八八年."
                },
                {
                    "C_INT": 3,
                    "C_PK": 3,
                    "C_VARCHAR": "Russian: Зарегистрируйтесь сейчас на Десятую Международную Конференцию по"
                },
                {
                    "C_INT": 4,
                    "C_PK": 4,
                    "C_VARCHAR": "Thai: แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช"
                },
                {
                    "C_INT": 5,
                    "C_PK": 5,
                    "C_VARCHAR": "Arabic: لقد لعبت أنت وأصدقاؤك لمدة وحصلتم علي من إجمالي النقاط"
                },
                {
                    "C_INT": 6,
                    "C_PK": 6,
                    "C_VARCHAR": "Special Characters: [\",'!@£$%^&*()]"
                },
            ],
        )
    def test_loading_tables_with_metadata_columns(self):
        """Loading multiple tables from the same input tap with various columns types"""
        tap_lines = test_utils.get_test_tap_lines('messages-with-three-streams.json')

        # Turning on adding metadata columns
        self.config['add_metadata_columns'] = True
        self.persist_lines(tap_lines)

        # Check if data loaded correctly and metadata columns exist
        self.assert_three_streams_are_into_bigquery(should_metadata_columns_exist=True)
    def test_loading_csv_files_with_invalid_compression(self):
        """Loading multiple tables from the same input tap with invalid compression"""
        tap_lines = test_utils.get_test_tap_lines('messages-with-three-streams.json')

        # Turn on an unsupported compression method
        self.config['compression'] = 'INVALID_COMPRESSION_METHOD'

        # Invalid compression method should raise exception
        with assert_raises(NotImplementedError):
            self.persist_messages(tap_lines)
    def test_logical_streams_from_pg_with_hard_delete_and_batch_size_of_5_and_no_records_should_pass(self):
        """Tests logical streams from pg with inserts, updates and deletes"""
        tap_lines = test_utils.get_test_tap_lines('messages-pg-logical-streams-no-records.json')

        # Turning on hard delete mode
        self.config['hard_delete'] = True
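        # Small batch size, but the input has no records so nothing should be flushed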
        self.config['batch_size_rows'] = 5
        self.persist_lines(tap_lines)

        self.assert_logical_streams_are_in_bigquery_and_are_empty()
    def test_loading_tables_with_no_encryption(self):
        """Loading multiple tables from the same input tap with various columns types"""
        tap_lines = test_utils.get_test_tap_lines(
            "messages-with-three-streams.json")

        # Turn off client-side encryption and load
        self.config["client_side_encryption_master_key"] = ""
        self.persist_lines_with_cache(tap_lines)

        self.assert_three_streams_are_into_snowflake()
    def test_logical_streams_from_pg_with_hard_delete_and_batch_size_of_5(self):
        """Tests logical streams from pg with inserts, updates and deletes"""
        tap_lines = test_utils.get_test_tap_lines('messages-logical-streams.json')

        # Turning on hard delete mode
        self.config['hard_delete'] = True
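        # Small batch size to force multiple intermediate flushes while loading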
        self.config['batch_size_rows'] = 5
        self.persist_lines_with_cache(tap_lines)

        self.assert_logical_streams_are_in_snowflake(True)
    def test_loading_tables_with_defined_parallelism(self):
        """Loading multiple tables from the same input tap with various columns types"""
        tap_lines = test_utils.get_test_tap_lines('messages-with-three-streams.json')

        # Using fixed 1 thread parallelism
        self.config['parallelism'] = 1
        self.persist_lines(tap_lines)

        # Check if data loaded correctly
        self.assert_three_streams_are_into_bigquery()
    def test_table_with_pk_adds_clustering(self):
        """Tests table with a primary key gets clustered on those fields"""
        tap_lines = test_utils.get_test_tap_lines('table_with_pk_cluster.json')
        self.persist_lines(tap_lines)

        # Get loaded rows from tables
        bigquery = DbSync(self.config)
        target_schema = self.config.get('default_target_schema', '')
        table = query(bigquery, "SELECT * FROM {}.test_table_cluster ORDER BY c_pk".format(target_schema))
        cluster_columns = query(bigquery, "SELECT clustering_ordinal_position, column_name FROM {}.INFORMATION_SCHEMA.COLUMNS WHERE table_name = 'test_table_cluster' AND clustering_ordinal_position > 0 ORDER BY 1".format(target_schema))

        # ----------------------------------------------------------------------
        # Check rows in table
        # ----------------------------------------------------------------------
        expected_table = [
            {'c_pk': 2, 'c_int': 2, 'c_varchar': '2', 'c_date': datetime.datetime(2019, 2, 12, 2, 0, 0, tzinfo=timezone.utc)},
            {'c_pk': 3, 'c_int': 3, 'c_varchar': '2', 'c_date': datetime.datetime(2019, 2, 15, 2, 0, 0, tzinfo=timezone.utc)}
        ]

        expected_cluster_columns = [
            {'clustering_ordinal_position': 1, 'column_name': 'c_pk'},
        ]

        self.assertEqual(self.remove_metadata_columns_from_rows(table), expected_table)
        self.assertEqual(cluster_columns, expected_cluster_columns)

        # ----------------------------------------------------------------------
        # Change the primary key and check if clustering stayed unchanged
        # ----------------------------------------------------------------------
        tap_lines = test_utils.get_test_tap_lines('table_with_pk_cluster_changed.json')
        self.persist_lines(tap_lines)

        table_changed = query(bigquery, "SELECT * FROM {}.test_table_cluster ORDER BY c_pk".format(target_schema))
        cluster_columns_changed = query(bigquery, "SELECT clustering_ordinal_position, column_name FROM {}.INFORMATION_SCHEMA.COLUMNS WHERE table_name = 'test_table_cluster' AND clustering_ordinal_position > 0 ORDER BY 1".format(target_schema))

        expected_table_changed = [
            {'c_pk': 2, 'c_int': 2, 'c_varchar': 'c', 'c_date': datetime.datetime(2019, 2, 12, 2, 0, 0, tzinfo=timezone.utc)},
            {'c_pk': 3, 'c_int': 3, 'c_varchar': 'c', 'c_date': datetime.datetime(2022, 5, 15, 5, 0, 0, tzinfo=timezone.utc)}
        ]

        self.assertEqual(self.remove_metadata_columns_from_rows(table_changed), expected_table_changed)
        self.assertEqual(cluster_columns_changed, expected_cluster_columns)
    def test_table_with_no_pk(self):
        """Tests table with a primary key gets clustered on those fields"""
        tap_lines = test_utils.get_test_tap_lines('table_with_no_pk.json')
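        # Allow loading the stream even though it has no primary key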
        self.config['primary_key_required'] = False
        self.persist_lines(tap_lines)

        # Get loaded rows from tables
        bigquery = DbSync(self.config)
        target_schema = self.config.get('default_target_schema', '')
        table = query(bigquery, "SELECT * FROM {}.test_table_no_pk ORDER BY c_id".format(target_schema))
        self.assertEqual(len(table), 2)
    def test_loading_tables_with_hard_delete(self):
        """Loading multiple tables from the same input tap with deleted rows"""
        tap_lines = test_utils.get_test_tap_lines(
            "messages-with-three-streams.json")

        # Turning on hard delete mode
        self.config["hard_delete"] = True
        self.persist_lines_with_cache(tap_lines)

        # Check if data loaded correctly and metadata columns exist
        self.assert_three_streams_are_into_snowflake(
            should_metadata_columns_exist=True, should_hard_deleted_rows=True)
    def test_loading_with_multiple_schema(self):
        """Loading table with multiple SCHEMA messages"""
        tap_lines = test_utils.get_test_tap_lines(
            "messages-with-multi-schemas.json")

        # Load with default settings
        self.persist_lines_with_cache(tap_lines)

        # Check if data loaded correctly
        self.assert_three_streams_are_into_snowflake(
            should_metadata_columns_exist=False,
            should_hard_deleted_rows=False)
    def test_non_db_friendly_columns(self):
        """Loading non-db friendly columns like, camelcase, minus signs, etc."""
        tap_lines = test_utils.get_test_tap_lines(
            "messages-with-non-db-friendly-columns.json")

        # Load with default settings
        self.persist_lines_with_cache(tap_lines)

        # Get loaded rows from tables
        snowflake = DbSync(self.config)
        target_schema = self.config.get("default_target_schema", "")
        table_non_db_friendly_columns = snowflake.query(
            "SELECT * FROM {}.test_table_non_db_friendly_columns ORDER BY c_pk"
            .format(target_schema))

        self.assertEqual(
            table_non_db_friendly_columns,
            [
                {
                    "C_PK": 1,
                    "CAMELCASECOLUMN": "Dummy row 1",
                    "MINUS-COLUMN": "Dummy row 1"
                },
                {
                    "C_PK": 2,
                    "CAMELCASECOLUMN": "Dummy row 2",
                    "MINUS-COLUMN": "Dummy row 2"
                },
                {
                    "C_PK": 3,
                    "CAMELCASECOLUMN": "Dummy row 3",
                    "MINUS-COLUMN": "Dummy row 3"
                },
                {
                    "C_PK": 4,
                    "CAMELCASECOLUMN": "Dummy row 4",
                    "MINUS-COLUMN": "Dummy row 4"
                },
                {
                    "C_PK": 5,
                    "CAMELCASECOLUMN": "Dummy row 5",
                    "MINUS-COLUMN": "Dummy row 5"
                },
            ],
        )