def test_loading__invalid__configuration__schema():
    """A stream whose JSON Schema `type` is invalid must be rejected."""
    cat_stream = CatStream(1)
    broken_schema = deepcopy(cat_stream.schema)
    broken_schema['schema']['type'] = 'invalid type for a JSON Schema'
    cat_stream.schema = broken_schema

    with pytest.raises(Exception, match=r'.*invalid JSON Schema instance.*'):
        main(CONFIG, input_stream=cat_stream)
def test_loading__simple(db_cleanup):
    """Loading a simple batch creates the expected tables, columns, and rows."""
    cat_stream = CatStream(100)
    main(CONFIG, input_stream=cat_stream)

    expected_cat_columns = {
        ('_sdc_batched_at', 'timestamp with time zone', 'YES'),
        ('_sdc_received_at', 'timestamp with time zone', 'YES'),
        ('_sdc_sequence', 'bigint', 'YES'),
        ('_sdc_table_version', 'bigint', 'YES'),
        ('adoption__adopted_on', 'timestamp with time zone', 'YES'),
        ('adoption__was_foster', 'boolean', 'YES'),
        ('age', 'bigint', 'YES'),
        ('id', 'bigint', 'NO'),
        ('name', 'text', 'NO'),
        ('pattern', 'text', 'YES')
    }
    expected_immunization_columns = {
        ('_sdc_level_0_id', 'bigint', 'NO'),
        ('_sdc_sequence', 'bigint', 'YES'),
        ('_sdc_source_key_id', 'bigint', 'NO'),
        ('date_administered', 'timestamp with time zone', 'YES'),
        ('type', 'text', 'YES')
    }

    with psycopg2.connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            cur.execute(get_columns_sql('cats'))
            assert set(cur.fetchall()) == expected_cat_columns

            cur.execute(get_columns_sql('cats__adoption__immunizations'))
            assert set(cur.fetchall()) == expected_immunization_columns

            cur.execute(get_count_sql('cats'))
            assert cur.fetchone()[0] == 100

            assert_records(conn, cat_stream.records, 'cats', 'id')
def test_deduplication_older_rows(db_cleanup):
    """Duplicate rows carrying an older sequence must lose to the newest row."""
    stream = CatStream(100,
                       nested_count=2,
                       duplicates=2,
                       duplicate_sequence_delta=-100)
    main(CONFIG, input_stream=stream)

    dup_pk_list = ','.join(map(str, stream.duplicate_pks_used))

    with psycopg2.connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            cur.execute(get_count_sql('cats'))
            cats_total = cur.fetchone()[0]

            cur.execute(get_count_sql('cats__adoption__immunizations'))
            immunizations_total = cur.fetchone()[0]

            cur.execute(
                'SELECT _sdc_sequence FROM cats WHERE id in ({})'.format(
                    dup_pk_list))
            duplicated_rows = cur.fetchall()

            # 100 cats + 2 duplicates were emitted, but dedup keeps one row
            # per PK, and only the newest sequence survives.
            assert stream.record_message_count == 102
            assert cats_total == 100
            assert immunizations_total == 200
            for row in duplicated_rows:
                assert row[0] == stream.sequence
def test_bigcommerce__sandbox(db_cleanup):
    """Sandboxed BigCommerce data lands with the expected tables and columns."""
    main(CONFIG, input_stream=BigCommerceStream())

    expected_tables = {
        'products',
        'customers',
        'products__categories',
        'products__related_products'
    }
    ## form_fields should not show up as it can only be `null`
    expected_customer_columns = {
        ('_sdc_table_version', 'bigint', 'YES'),
        ('_sdc_received_at', 'timestamp with time zone', 'YES'),
        ('_sdc_sequence', 'bigint', 'YES'),
        ('_sdc_batched_at', 'timestamp with time zone', 'YES'),
        ('id', 'bigint', 'NO'),
        ('date_modified', 'timestamp with time zone', 'NO'),
        ('store_credit', 'text', 'YES'),
        ('notes', 'text', 'YES'),
        ('tax_exempt_category', 'text', 'YES'),
        ('email', 'text', 'YES'),
        ('company', 'text', 'YES'),
        ('customer_group_id', 'bigint', 'YES'),
        ('registration_ip_address', 'text', 'YES'),
        ('date_created', 'timestamp with time zone', 'NO'),
        ('accepts_marketing', 'boolean', 'YES'),
        ('addresses__resource', 'text', 'YES'),
        ('reset_pass_on_login', 'boolean', 'YES'),
        ('addresses__url', 'text', 'YES'),
        ('first_name', 'text', 'YES'),
        ('phone', 'text', 'YES'),
        ('last_name', 'text', 'YES')
    }

    with psycopg2.connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            assert_tables_equal(cur, expected_tables)
            assert_columns_equal(cur, 'customers', expected_customer_columns)
def test_multiple_batches_by_memory_upsert(db_cleanup):
    """Memory-bounded batching must still upsert to the correct final counts."""
    batch_config = CONFIG.copy()
    batch_config['max_batch_size'] = 1024
    batch_config['batch_detection_threshold'] = 5

    # Second round changes only the nested count; the parent row count must
    # stay at 100 while the child table grows.
    for nested, expected_nested_rows in ((2, 200), (3, 300)):
        stream = CatStream(100, nested_count=nested)
        main(batch_config, input_stream=stream)

        with psycopg2.connect(**TEST_DB) as conn:
            with conn.cursor() as cur:
                cur.execute(get_count_sql('cats'))
                assert cur.fetchone()[0] == 100

                cur.execute(get_count_sql('cats__adoption__immunizations'))
                assert cur.fetchone()[0] == expected_nested_rows

                assert_records(conn, stream.records, 'cats', 'id')
def test_loading__invalid__default_null_value__non_nullable_column():
    """Persisting the reserved NULL sentinel into a NOT NULL column must fail."""

    class NullDefaultCatStream(CatStream):
        def generate_record(self):
            row = CatStream.generate_record(self)
            row['name'] = postgres.RESERVED_NULL_DEFAULT
            return row

    with pytest.raises(postgres.PostgresError, match=r'.*IntegrityError.*'):
        main(CONFIG, input_stream=NullDefaultCatStream(20))
def test_upsert__invalid__primary_key_change(db_cleanup):
    """Changing `key_properties` between runs of a stream must be rejected."""
    main(CONFIG, input_stream=CatStream(100))

    changed_stream = CatStream(100)
    altered_schema = deepcopy(changed_stream.schema)
    altered_schema['key_properties'].append('name')
    changed_stream.schema = altered_schema

    with pytest.raises(postgres.PostgresError, match=r'.*key_properties.*'):
        main(CONFIG, input_stream=changed_stream)
def test_loading__invalid__column_type_change__pks__nullable():
    """Making a primary-key column nullable must be rejected as a type change."""
    main(CONFIG, input_stream=CatStream(20))

    nullable_pk_stream = CatStream(20)
    nullable_pk_stream.schema = deepcopy(nullable_pk_stream.schema)
    id_schema = nullable_pk_stream.schema['schema']['properties']['id']
    nullable_pk_stream.schema['schema']['properties']['id'] = \
        json_schema.make_nullable(id_schema)

    with pytest.raises(postgres.PostgresError,
                       match=r'.*key_properties. type change detected'):
        main(CONFIG, input_stream=nullable_pk_stream)
def test_multiple_batches_by_memory(db_cleanup):
    """Memory-bounded batching should flush repeatedly through write_batch."""
    with patch.object(postgres.PostgresTarget,
                      'write_batch',
                      side_effect=mocked_mock_write_batch) as mock_write_batch:
        batch_config = CONFIG.copy()
        batch_config['max_batch_size'] = 1024
        batch_config['batch_detection_threshold'] = 5

        main(batch_config, input_stream=CatStream(100))

        assert mock_write_batch.call_count == 21
def test_loading__invalid__records__disable():
    """With invalid-record detection disabled, bad records are silently dropped."""
    relaxed_config = deepcopy(CONFIG)
    relaxed_config['invalid_records_detect'] = False

    main(relaxed_config, input_stream=InvalidCatStream(100))

    with psycopg2.connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            ## Every `cat` record was invalid, so nothing was persisted and the
            ## table was never created; the column query returns no rows.
            cur.execute(get_columns_sql('cats'))
            assert not cur.fetchall()
def test_loading__new_non_null_column(db_cleanup):
    """A defaulted integer column added mid-stream leaves NULLs on older rows."""
    cat_count = 50
    main(CONFIG, input_stream=CatStream(cat_count))

    class NonNullStream(CatStream):
        def generate_record(self):
            record = CatStream.generate_record(self)
            record['id'] = record['id'] + cat_count
            return record

    non_null_stream = NonNullStream(cat_count)
    non_null_stream.schema = deepcopy(non_null_stream.schema)
    non_null_stream.schema['schema']['properties']['paw_toe_count'] = {
        'type': 'integer',
        'default': 5
    }
    main(CONFIG, input_stream=non_null_stream)

    expected_columns = {
        ('_sdc_batched_at', 'timestamp with time zone', 'YES'),
        ('_sdc_received_at', 'timestamp with time zone', 'YES'),
        ('_sdc_sequence', 'bigint', 'YES'),
        ('_sdc_table_version', 'bigint', 'YES'),
        ('adoption__adopted_on', 'timestamp with time zone', 'YES'),
        ('adoption__was_foster', 'boolean', 'YES'),
        ('age', 'bigint', 'YES'),
        ('id', 'bigint', 'NO'),
        ('name', 'text', 'NO'),
        ('paw_size', 'bigint', 'NO'),
        ('paw_colour', 'text', 'NO'),
        ('paw_toe_count', 'bigint', 'YES'),
        ('flea_check_complete', 'boolean', 'NO'),
        ('pattern', 'text', 'YES')
    }

    with psycopg2.connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            cur.execute(get_columns_sql('cats'))
            assert set(cur.fetchall()) == expected_columns

            cur.execute(
                sql.SQL('SELECT {}, {} FROM {}').format(
                    sql.Identifier('id'),
                    sql.Identifier('paw_toe_count'),
                    sql.Identifier('cats')))
            rows = cur.fetchall()

            ## Rows loaded before the column existed read back as NULL;
            ## rows loaded afterwards carry a value.
            assert 2 * cat_count == len(rows)
            assert cat_count == len([r for r in rows if r[1] is None])
            assert cat_count == len([r for r in rows if r[1] is not None])
def test_loading__invalid__column_type_change__pks():
    """Changing a primary-key column's type between runs must be rejected."""
    main(CONFIG, input_stream=CatStream(20))

    class StringIdCatStream(CatStream):
        def generate_record(self):
            record = CatStream.generate_record(self)
            record['id'] = str(record['id'])
            return record

    string_id_stream = StringIdCatStream(20)
    string_id_stream.schema = deepcopy(string_id_stream.schema)
    string_id_stream.schema['schema']['properties']['id'] = {'type': 'string'}

    with pytest.raises(postgres.PostgresError,
                       match=r'.*key_properties. type change detected'):
        main(CONFIG, input_stream=string_id_stream)
def test_nested_delete_on_parent(db_cleanup):
    """Re-upserting parents with fewer children must shrink the child table."""
    nested_counts = {}

    # First load with 3 children per cat, then re-upsert with only 2.
    for label, nested in (('high', 3), ('low', 2)):
        stream = CatStream(100, nested_count=nested)
        main(CONFIG, input_stream=stream)

        with psycopg2.connect(**TEST_DB) as conn:
            with conn.cursor() as cur:
                cur.execute(get_count_sql('cats__adoption__immunizations'))
                nested_counts[label] = cur.fetchone()[0]
                assert_records(conn, stream.records, 'cats', 'id')

    assert nested_counts['low'] < nested_counts['high']
def test_hubspot__sandbox(db_cleanup):
    """Sandboxed HubSpot data lands with the expected table, columns, and rows."""
    hubspot_config = CONFIG.copy()
    hubspot_config['persist_empty_tables'] = True
    main(hubspot_config, input_stream=HubspotStream())

    expected_deal_columns = {
        ('_sdc_table_version', 'bigint', 'YES'),
        ('_sdc_received_at', 'timestamp with time zone', 'YES'),
        ('_sdc_sequence', 'bigint', 'YES'),
        ('_sdc_primary_key', 'text', 'NO'),
        ('_sdc_batched_at', 'timestamp with time zone', 'YES'),
        ('properties__num_contacted_notes__value__f', 'double precision', 'YES'),
        ('properties__num_contacted_notes__value__s', 'text', 'YES')
    }

    with psycopg2.connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            assert_tables_equal(cur, {'deals'})
            assert_columns_equal(cur, 'deals', expected_deal_columns)
            assert_count_equal(cur, 'deals', 7)
def test_upsert(db_cleanup):
    """Re-upserting the same PKs is idempotent; new PKs grow the table."""
    # 100 then 100 again (idempotent), then 200 (adds the new PKs).
    for cat_count in (100, 100, 200):
        stream = CatStream(cat_count)
        main(CONFIG, input_stream=stream)

        with psycopg2.connect(**TEST_DB) as conn:
            with conn.cursor() as cur:
                cur.execute(get_count_sql('cats'))
                assert cur.fetchone()[0] == cat_count
                assert_records(conn, stream.records, 'cats', 'id')
def test_deduplication_existing_new_rows(db_cleanup):
    """Rows arriving with an older sequence must not overwrite newer ones."""
    fresh_stream = CatStream(100, nested_count=2)
    main(CONFIG, input_stream=fresh_stream)
    original_sequence = fresh_stream.sequence

    stale_stream = CatStream(100,
                             nested_count=2,
                             sequence=original_sequence - 20)
    main(CONFIG, input_stream=stale_stream)

    with psycopg2.connect(**TEST_DB) as conn:
        with conn.cursor() as cur:
            cur.execute(get_count_sql('cats'))
            assert cur.fetchone()[0] == 100

            cur.execute(get_count_sql('cats__adoption__immunizations'))
            assert cur.fetchone()[0] == 200

            # Only the original (newer) sequence should remain.
            cur.execute('SELECT DISTINCT _sdc_sequence FROM cats')
            sequences = cur.fetchall()
            assert len(sequences) == 1
            assert sequences[0][0] == original_sequence
def test_full_table_replication(db_cleanup):
    """Each new table version fully replaces the previous contents."""
    # (cat_count, version, nested_count, expected child rows)
    scenarios = (
        (110, 0, 3, 330),
        (100, 1, 3, 300),
        (120, 2, 2, 240),
    )

    for cat_count, version, nested_count, expected_sub_count in scenarios:
        stream = CatStream(cat_count,
                           version=version,
                           nested_count=nested_count)
        main(CONFIG, input_stream=stream)

        with psycopg2.connect(**TEST_DB) as conn:
            with conn.cursor() as cur:
                cur.execute(get_count_sql('cats'))
                parent_count = cur.fetchone()[0]
                cur.execute(get_count_sql('cats__adoption__immunizations'))
                sub_count = cur.fetchone()[0]

                assert_records(conn, stream.records, 'cats', 'id',
                               match_pks=True)
                assert parent_count == cat_count
                assert sub_count == expected_sub_count
def test_loading__invalid__records():
    """A single invalid record aborts the run with SingerStreamError."""
    invalid_stream = InvalidCatStream(1)

    with pytest.raises(singer_stream.SingerStreamError, match=r'.*'):
        main(CONFIG, input_stream=invalid_stream)
def test_loading__invalid__records__threshold():
    """Crossing `invalid_records_threshold` aborts with an error citing it.

    The error raised once the threshold is exceeded is expected to mention
    the configured threshold value (10).
    """
    config = deepcopy(CONFIG)
    config['invalid_records_threshold'] = 10

    # Fixed regex: the previous pattern r'.*.10*' only required any character
    # followed by "1" and zero-or-more "0"s, so it matched far more than
    # intended. pytest's `match` uses re.search, so r'.*10.*' requires the
    # literal threshold "10" somewhere in the message.
    with pytest.raises(singer_stream.SingerStreamError, match=r'.*10.*'):
        main(config, input_stream=InvalidCatStream(20))
def test_loading__column_type_change__nullable(db_cleanup):
    """A NOT NULL column can be relaxed to nullable while keeping its data."""
    cat_count = 20

    name_required = ('name', 'text', 'NO')
    name_nullable = ('name', 'text', 'YES')
    base_columns = {
        ('_sdc_batched_at', 'timestamp with time zone', 'YES'),
        ('_sdc_received_at', 'timestamp with time zone', 'YES'),
        ('_sdc_sequence', 'bigint', 'YES'),
        ('_sdc_table_version', 'bigint', 'YES'),
        ('adoption__adopted_on', 'timestamp with time zone', 'YES'),
        ('adoption__was_foster', 'boolean', 'YES'),
        ('age', 'bigint', 'YES'),
        ('id', 'bigint', 'NO'),
        ('paw_size', 'bigint', 'NO'),
        ('paw_colour', 'text', 'NO'),
        ('flea_check_complete', 'boolean', 'NO'),
        ('pattern', 'text', 'YES')
    }

    def assert_cats_state(expected_columns, total, named, nameless):
        """Check the `cats` column set and the NULL/non-NULL split of `name`."""
        with psycopg2.connect(**TEST_DB) as conn:
            with conn.cursor() as cur:
                cur.execute(get_columns_sql('cats'))
                assert set(cur.fetchall()) == expected_columns

                cur.execute(
                    sql.SQL('SELECT {} FROM {}').format(
                        sql.Identifier('name'), sql.Identifier('cats')))
                names = [row[0] for row in cur.fetchall()]
                assert total == len(names)
                assert named == len([n for n in names if n is not None])
                assert nameless == len([n for n in names if n is None])

    ## Seed with the original schema where `name` is required
    main(CONFIG, input_stream=CatStream(cat_count))
    assert_cats_state(base_columns | {name_required},
                      cat_count, cat_count, 0)

    class NameNullCatStream(CatStream):
        def generate_record(self):
            record = CatStream.generate_record(self)
            record['id'] = record['id'] + cat_count
            record['name'] = None
            return record

    null_name_stream = NameNullCatStream(cat_count)
    null_name_stream.schema = deepcopy(null_name_stream.schema)
    null_name_stream.schema['schema']['properties']['name'] = \
        json_schema.make_nullable(
            null_name_stream.schema['schema']['properties']['name'])
    main(CONFIG, input_stream=null_name_stream)

    ## `name` is now nullable; the earlier non-NULL rows must survive
    assert_cats_state(base_columns | {name_nullable},
                      2 * cat_count, cat_count, cat_count)

    class NameNonNullCatStream(CatStream):
        def generate_record(self):
            record = CatStream.generate_record(self)
            record['id'] = record['id'] + 2 * cat_count
            return record

    main(CONFIG, input_stream=NameNonNullCatStream(cat_count))

    ## Column stays nullable; new non-NULL values coexist with earlier NULLs
    assert_cats_state(base_columns | {name_nullable},
                      3 * cat_count, 2 * cat_count, cat_count)