def generate_catalog(client, report_config, standard_fields, custom_fields,
                     all_cubes, cubes_lookup, profile_ids):
    """Generate a catalog entry for each report specified in `report_config`.

    Each report's fields are filtered down to the metrics/dimensions it
    declares, and the first 10 metrics plus any `default_dimensions` are
    marked `selected-by-default` in the stream metadata.

    Returns a `Catalog` with one `CatalogEntry` per report.
    """
    # Removed: a commented-out PREMADE_REPORTS loop and a dead
    # `for report in []:` loop that could never execute.
    catalog_entries = []
    for report in report_config:
        # Normalize the report name so it is safe for BigQuery
        # (spaces -> underscores, lowercase). Note this mutates the
        # caller's report dict, as the original code did.
        report['name'] = report['name'].replace(' ', '_').lower()

        metrics_dimensions = set(report['metrics'] + report['dimensions'])
        # First 10 metrics in the definition plus the default dimensions.
        selected_by_default = {
            *report['metrics'][:10],
            *report.get('default_dimensions', []),
        }

        premade_fields = [
            field for field in standard_fields
            if field['id'] in metrics_dimensions
        ]

        schema, mdata = generate_premade_catalog_entry(
            premade_fields, all_cubes, cubes_lookup)

        # Mark the default fields as selected-by-default in the metadata.
        for field_name in selected_by_default:
            mdata = metadata.write(mdata, ('properties', field_name),
                                   'selected-by-default', True)

        catalog_entries.append(
            CatalogEntry(schema=Schema.from_dict(schema),
                         key_properties=['_sdc_record_hash'],
                         stream=report['name'],
                         tap_stream_id=report['name'],
                         metadata=metadata.to_list(mdata)))

    return Catalog(catalog_entries)
def generate_catalog(client, standard_fields, custom_fields, all_cubes,
                     cubes_lookup, profile_id):
    """Build a Catalog containing the single 'report' stream.

    The schema and metadata come from `generate_catalog_entry`; the stream
    is keyed on the synthetic `_sdc_record_hash` column.
    """
    schema, mdata = generate_catalog_entry(client, standard_fields,
                                           custom_fields, all_cubes,
                                           cubes_lookup, profile_id)

    entry = CatalogEntry(schema=Schema.from_dict(schema),
                         key_properties=['_sdc_record_hash'],
                         stream='report',
                         tap_stream_id='report',
                         metadata=metadata.to_list(mdata))

    return Catalog([entry])
def test_binlog_stream_requires_historical_with_no_log_coordinates_returns_true(
        self):
    """An empty bookmark (no gtid, no log file/pos) forces a historical sync."""
    entry = CatalogEntry(tap_stream_id='stream_1', schema={})
    bookmarks = {
        'stream_1': {},
        'stream_2': {},
    }
    self.assertTrue(
        binlog_stream_requires_historical(entry, {'bookmarks': bookmarks}))
def test_binlog_stream_requires_historical_with_gtid_returns_false(self):
    """A bookmark that already holds a GTID does not need a historical sync."""
    entry = CatalogEntry(tap_stream_id='stream_1', schema={})
    bookmarks = {
        'stream_1': {'gtid': '0-3834-222'},
        'stream_2': {},
    }
    self.assertFalse(
        binlog_stream_requires_historical(entry, {'bookmarks': bookmarks}))
def test_binlog_stream_requires_historical_with_log_coordinates_returns_false(
        self):
    """Complete log coordinates (file + position) mean no historical sync."""
    entry = CatalogEntry(tap_stream_id='stream_1', schema={})
    bookmarks = {
        'stream_1': {
            'log_file': 'binlog.0001',
            'log_pos': 1123,
        },
        'stream_2': {},
    }
    self.assertFalse(
        binlog_stream_requires_historical(entry, {'bookmarks': bookmarks}))
def generate_streams(conn, table_info):
    """Build a CatalogEntry for every table described in `table_info`.

    `table_info` maps schema name -> table name -> info dict with keys
    'columns', 'row_count' and 'is_view'. Primary-key columns are looked up
    from INFORMATION_SCHEMA and recorded as `table-key-properties`.

    Returns a `Catalog` of all discovered streams.
    """
    entries = []
    for schema_name, tables in table_info.items():
        for table, table_data in tables.items():
            with conn.cursor() as cur:
                # NOTE(review): identifiers are interpolated directly into
                # the query. They come from catalog discovery rather than
                # end-user input, but binding them as parameters would be
                # safer if the driver's paramstyle allows it — confirm.
                sql = f"""
                    SELECT COLUMN_NAME
                    FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE AS kcu
                    INNER JOIN INFORMATION_SCHEMA.TABLE_CONSTRAINTS AS tc
                        ON tc.CONSTRAINT_NAME = kcu.CONSTRAINT_NAME
                        AND tc.CONSTRAINT_TYPE = 'PRIMARY KEY'
                    WHERE kcu.TABLE_SCHEMA = '{schema_name}'
                      AND kcu.TABLE_NAME = '{table}'"""
                cur.execute(sql)
                table_pks = [
                    col['COLUMN_NAME'] for col in convert_result_to_dict(cur)
                ]

                cur.execute("""SELECT db_name()""")
                database = cur.fetchone()[0]

            meta = {}
            columns = table_data['columns']
            metadata.write(meta, (), 'table-key-properties', table_pks)
            metadata.write(meta, (), 'schema-name', schema_name)
            metadata.write(meta, (), 'database-name', database)
            metadata.write(meta, (), 'row-count', table_data['row_count'])
            metadata.write(meta, (), 'is-view', table_data['is_view'])

            column_schemas = {
                col_name: schema_for_column(col_info, table_pks)
                for col_name, col_info in columns.items()
            }

            # Bug fix: the JSON-schema type must be the string 'object';
            # the original passed the Python builtin `object` class.
            schema = Schema(type='object', properties=column_schemas)
            entries.append(
                CatalogEntry(table=table,
                             stream=table,
                             metadata=metadata.to_list(meta),
                             tap_stream_id=get_tap_stream_id(
                                 database, schema_name, table),
                             schema=schema))

    return Catalog(entries)
def test_binlog_stream_requires_historical_with_log_coordinates_and_last_pk_value_returns_true(
        self):
    """Log coordinates plus an unfinished pk cursor still require history."""
    entry = CatalogEntry(tap_stream_id='stream_1', schema={})
    bookmarks = {
        'stream_1': {
            'log_file': 'binlog.0001',
            'log_pos': 1123,
            'last_pk_fetched': '111',
        },
        'stream_2': {},
    }
    self.assertTrue(
        binlog_stream_requires_historical(entry, {'bookmarks': bookmarks}))
def get_catalog_entry(self, swagger: JsonResult) -> CatalogEntry:
    """Build the singer CatalogEntry for this stream from its swagger spec."""
    schema = self._map_to_schema(swagger)

    # get_standard_metadata wants a list of replication keys, or None.
    replication_keys = (
        [self.replication_key] if self.replication_key else None)
    stream_metadata = metadata.get_standard_metadata(
        schema=schema.to_dict(),
        key_properties=self.key_properties,
        valid_replication_keys=replication_keys,
    )

    replication_method = (
        self.replication_method.name if self.replication_method else None)

    return CatalogEntry(
        tap_stream_id=self.stream_id,
        stream=self.stream_id,
        schema=schema,
        key_properties=self.key_properties,
        metadata=stream_metadata,
        replication_key=self.replication_key,
        replication_method=replication_method,
    )
def generate_catalog(
        client,
        report_config,
        standard_fields,
        custom_fields,
        all_cubes,
        cubes_lookup,
        profile_ids,
):
    """Generate a catalog entry for each report specified in `report_config`."""
    entries = []
    for report in report_config:
        # First 10 metrics plus all declared dimensions are pre-selected.
        default_fields = set(report['metrics'][:10])
        default_fields.update(report.get('dimensions', []))

        schema, mdata = generate_catalog_entry(client, standard_fields,
                                               custom_fields, all_cubes,
                                               cubes_lookup, profile_ids)

        for field_name in default_fields:
            mdata = metadata.write(mdata, ('properties', field_name),
                                   'selected-by-default', True)

        entries.append(
            CatalogEntry(
                schema=Schema.from_dict(schema),
                key_properties=['_sdc_record_hash'],
                stream=report['name'],
                tap_stream_id=report['name'],
                metadata=metadata.to_list(mdata),
            ))
    return Catalog(entries)
class TestValidateDependencies(unittest.TestCase):
    """Tests for stream selection and substream dependency validation."""

    # Shared fixture: only issue_board, sprints and issue_comments selected.
    catalog = Catalog([
        CatalogEntry(tap_stream_id='boards',
                     schema=Schema(),
                     metadata=[{
                         'metadata': {'selected': False},
                         'breadcrumb': []
                     }]),
        CatalogEntry(tap_stream_id='issue_board',
                     schema=Schema(),
                     metadata=[{
                         'metadata': {'selected': True},
                         'breadcrumb': []
                     }]),
        CatalogEntry(tap_stream_id='project_board',
                     schema=Schema(),
                     metadata=[{
                         'metadata': {'selected': False},
                         'breadcrumb': []
                     }]),
        CatalogEntry(tap_stream_id='epics',
                     schema=Schema(),
                     metadata=[{
                         'metadata': {'selected': False},
                         'breadcrumb': []
                     }]),
        CatalogEntry(tap_stream_id='sprints',
                     schema=Schema(),
                     metadata=[{
                         'metadata': {'selected': True},
                         'breadcrumb': []
                     }]),
        CatalogEntry(tap_stream_id='issue_comments',
                     schema=Schema(),
                     metadata=[{
                         'metadata': {'selected': True},
                         'breadcrumb': []
                     }]),
    ])

    @staticmethod
    def _streams_with_recursive_issues():
        """Return a 'boards' stream definition with the 'issues' substream
        attached, deep-copied so the shared `streams.STREAMS` registry is
        never mutated (the previous in-place assignment leaked state into
        every other test that touched STREAMS)."""
        from copy import deepcopy
        test_streams = {'boards': deepcopy(streams.STREAMS['boards'])}
        test_streams['boards']['substreams']['issues'] = deepcopy(
            streams.STREAMS['issues'])
        return test_streams

    def test_is_selected(self):
        """IssueBoard is selected because the issue_board stream is selected."""
        selected = utils.is_selected(streams.IssueBoard, self.catalog)
        self.assertTrue(selected)

    def test_raises_substream_error(self):
        # Recursive checking: issues is nested under boards.
        self.assertRaises(utils.DependencyException,
                          utils.validate_dependencies,
                          self._streams_with_recursive_issues(),
                          self.catalog)

    def test_raises_right_amount_of_substream_errors(self):
        # Recursive checking: issues is nested under boards.
        with self.assertRaises(utils.DependencyException) as context:
            utils.validate_dependencies(
                self._streams_with_recursive_issues(), self.catalog)
        self.assertTrue(len(context.exception.errors) == 3)