def produce_collection_schema(collection):
    """Build a singer catalog entry for a MongoDB collection.

    Writes database-name and row-count stream metadata, derives
    table-key-properties from the collection's indexes, and advertises
    every non-_id single-field index as a valid replication key.

    Args:
        collection: a pymongo Collection.

    Returns:
        dict: the catalog entry for the collection.
    """
    collection_name = collection.name
    collection_db_name = collection.database.name

    mdata = {}
    mdata = metadata.write(mdata, (), 'database-name', collection_db_name)
    mdata = metadata.write(mdata, (), 'row-count',
                           collection.estimated_document_count())

    coll_indexes = collection.index_information()

    # If _id is in indexes, write `table-key-properties = ['_id']` metadata.
    # If _id isn't present, fall back to the first unique index as the key.
    if coll_indexes.get('_id_'):
        mdata = metadata.write(mdata, (), 'table-key-properties', ['_id'])
    else:
        for index_info in coll_indexes.values():
            if index_info.get('unique'):
                # 'key' is a list of (field_name, sort_direction) tuples.
                key_fields = [field for field, _ in index_info.get('key', [])]
                mdata = metadata.write(mdata, (), 'table-key-properties',
                                       key_fields)
                break

    # Any single-field index other than _id can serve as a replication key.
    # Compound indexes are not supported for key-based incremental sync.
    valid_replication_keys = [
        index_info['key'][0][0]
        for index_name, index_info in coll_indexes.items()
        if index_name != '_id_' and len(index_info.get('key', [])) == 1
    ]
    if valid_replication_keys:
        mdata = metadata.write(mdata, (), 'valid-replication-keys',
                               valid_replication_keys)

    return {
        'table_name': collection_name,
        'stream': collection_name,
        'metadata': metadata.to_list(mdata),
        'tap_stream_id': "{}-{}".format(collection_db_name, collection_name),
        'schema': {
            'type': 'object'
        }
    }
def select_all_fields_except(self, blacklisted_fields, schema, md):
    """Mark the stream and all of its properties as selected, skipping
    any blacklisted field as well as the hard-coded 'day' field."""
    md = metadata.write(md, (), 'selected', True)
    excluded = set(blacklisted_fields) | {'day'}
    for prop in schema['properties']:
        if prop not in excluded:
            md = metadata.write(md, ('properties', prop), 'selected', True)
    return md
def get_schemas(config, config_path):
    """Discover schemas and metadata for every valid MS Dynamics stream.

    Returns:
        tuple: ({stream_name: schema}, {stream_name: metadata list})
    """
    schemas = {}
    schemas_metadata = {}

    streams = get_streams(config, config_path)
    LOGGER.info('There are {:d} valid streams in MS Dynamics'.format(
        len(streams)))

    for stream_name, stream_object in streams.items():
        schema = stream_object.schema
        mdata = metadata.to_map(
            metadata.get_standard_metadata(
                schema=schema,
                key_properties=stream_object.key_properties,
                replication_method=stream_object.replication_method))

        if stream_object.valid_replication_keys:
            mdata = metadata.write(mdata, (), 'valid-replication-keys',
                                   stream_object.valid_replication_keys)
        # The replication key must always be emitted with each record.
        if stream_object.replication_key:
            mdata = metadata.write(
                mdata, ('properties', stream_object.replication_key),
                'inclusion', 'automatic')

        schemas[stream_name] = schema
        schemas_metadata[stream_name] = metadata.to_list(mdata)

    return schemas, schemas_metadata
def get_schemas(config):
    """Run discovery for every configured S3 table.

    Returns:
        tuple: ({tap_stream_id: schema}, {tap_stream_id: metadata list})
    """
    schemas = {}
    schemas_metadata = {}

    s3_client = S3Client(config['aws_access_key_id'],
                         config['aws_secret_access_key'])

    for tap_stream_id, table_spec in config['tables'].items():
        LOGGER.info(f'Starting discovery for {tap_stream_id}')
        stream = Stream(s3_client, table_spec, None)
        schema = stream.get_schema()

        mdata = metadata.to_map(metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream.key_properties,
            replication_method=stream.replication_method
        ))

        if stream.valid_replication_keys:
            mdata = metadata.write(mdata, (), 'valid-replication-keys',
                                   stream.valid_replication_keys)
        # The replication key must always be emitted with each record.
        if stream.replication_key:
            mdata = metadata.write(mdata,
                                   ('properties', stream.replication_key),
                                   'inclusion', 'automatic')

        schemas[tap_stream_id] = schema
        schemas_metadata[tap_stream_id] = metadata.to_list(mdata)

    return schemas, schemas_metadata
def discover(ctx):
    """Build the discovery catalog after verifying API credentials."""
    check_credentials_are_authorized(ctx)
    catalog = Catalog([])
    for tap_stream_id in schemas.stream_ids:
        schema_dict = schemas.load_schema(tap_stream_id)
        pk_fields = schemas.PK_FIELDS[tap_stream_id]

        mdata = metadata.to_map(
            metadata.get_standard_metadata(schema_dict,
                                           key_properties=pk_fields))

        # NB: `lists` and `messages` are required for their substreams.
        # This is an approximation of the initial functionality using
        # metadata, which marked them as `selected=True` in the schema.
        if tap_stream_id in ('lists', 'messages'):
            mdata = metadata.write(mdata, (), 'inclusion', 'automatic')
            for field_name in schema_dict['properties']:
                mdata = metadata.write(mdata, ('properties', field_name),
                                       'inclusion', 'automatic')

        entry = CatalogEntry(stream=tap_stream_id,
                             tap_stream_id=tap_stream_id,
                             key_properties=pk_fields,
                             schema=Schema.from_dict(schema_dict),
                             metadata=metadata.to_list(mdata))
        catalog.streams.append(entry)
    return catalog
def discover():
    """Build the discovery catalog from the loaded schemas.

    Returns:
        Catalog: one CatalogEntry per discovered schema.
    """
    raw_schemas = load_schemas()
    streams = []
    for stream_id, schema in raw_schemas.items():
        # TODO: populate real per-stream metadata and key properties here.
        # BUG FIX: the result of metadata.write() on a freshly-created map
        # was previously discarded, so 'selected': True never reached the
        # emitted catalog. Keep the map, write into it, then serialize it.
        mdata = metadata.to_map(metadata.get_standard_metadata(schema.to_dict()))
        mdata = metadata.write(mdata, (), "selected", True)
        stream_metadata = metadata.to_list(mdata)
        key_properties = ['id']
        streams.append(
            CatalogEntry(
                tap_stream_id=stream_id,
                stream=stream_id,
                schema=schema,
                key_properties=key_properties,
                metadata=stream_metadata,
                replication_key=None,
                is_view=None,
                database=None,
                table=None,
                row_count=None,
                stream_alias=None,
                replication_method=None,
            ))
    return Catalog(streams)
def discover():
    '''
    Run discovery mode
    '''
    streams = []
    for stream_id, stream_object in STREAMS.items():
        raw_schema = load_schema(stream_id)
        schema = Schema.from_dict(raw_schema)
        mdata = metadata.to_map(
            metadata.get_standard_metadata(
                schema=raw_schema,
                schema_name=stream_id,
                key_properties=stream_object.key_properties,
                valid_replication_keys=[stream_object.replication_key],
                replication_method=stream_object.replication_method))

        # Make sure that the replication key field is mandatory.
        # FIX: assign the return value of metadata.write() instead of
        # relying on in-place mutation of the map, matching the
        # convention used everywhere else in this codebase.
        if stream_object.replication_key:
            mdata = metadata.write(
                mdata, ('properties', stream_object.replication_key),
                'inclusion', 'automatic')

        streams.append(
            CatalogEntry(stream=stream_id,
                         tap_stream_id=stream_id,
                         key_properties=stream_object.key_properties,
                         schema=schema,
                         metadata=metadata.to_list(mdata)))
    return Catalog(streams)
def discover(): """ Allow discovery of all streams and metadata """ raw_schemas = load_schemas() streams = [] for schema_name, schema in raw_schemas.items(): mdata = metadata.new() mdata = metadata.write(mdata, (), 'table-key-properties', ['id']) mdata = metadata.write(mdata, ('properties', 'id'), 'inclusion', 'automatic') mdata = metadata.write(mdata, (), 'valid-replication-keys', ['updated_at']) mdata = metadata.write(mdata, ('properties', 'updated_at'), 'inclusion', 'automatic') for field_name in schema['properties'].keys(): if field_name not in {'id', 'updated_at'}: mdata = metadata.write(mdata, ('properties', field_name), 'inclusion', 'available') # create and add catalog entry catalog_entry = { 'stream': schema_name, 'tap_stream_id': schema_name, 'schema': schema, 'metadata': metadata.to_list(mdata), 'key_properties': ['id'] } streams.append(catalog_entry) return {'streams': streams}
def do_discover():
    """Discover every stream and assemble the catalog."""
    catalog_entries = []
    for stream_name, schema in _load_schemas().items():
        stream = STREAM_OBJECTS[stream_name]
        mdata = metadata.to_map(metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream.key_properties,
            replication_method=stream.replication_method
        ))

        rep_key = stream.replication_key
        if rep_key:
            mdata = metadata.write(mdata, (), 'valid-replication-keys',
                                   [rep_key])

        # Keys and the bookmark field are always emitted; the rest is opt-in.
        for field_name in schema['properties']:
            is_automatic = (field_name in stream.key_properties
                            or field_name == rep_key)
            mdata = metadata.write(mdata, ('properties', field_name),
                                   'inclusion',
                                   'automatic' if is_automatic else 'available')

        catalog_entries.append({
            'stream': stream_name,
            'tap_stream_id': stream_name,
            'schema': schema,
            'metadata': metadata.to_list(mdata),
            'key_properties': stream.key_properties})
    return Catalog.from_dict({'streams': catalog_entries})
def get_metadata(schema, key_properties, replication_method, replication_key):
    """Build a singer metadata list for one stream.

    Key properties, the replication key, and the 'updated' field are
    marked 'automatic'; every other property is 'available'.
    """
    mdata = metadata.new()
    mdata = metadata.write(mdata, (), 'table-key-properties', key_properties)
    mdata = metadata.write(mdata, (), 'forced-replication-method',
                           replication_method)
    if replication_key:
        mdata = metadata.write(mdata, (), 'valid-replication-keys',
                               [replication_key])

    always_included = set(key_properties) | {replication_key, "updated"}
    for field_name in schema['properties']:
        inclusion = 'automatic' if field_name in always_included else 'available'
        mdata = metadata.write(mdata, ('properties', field_name),
                               'inclusion', inclusion)
    return metadata.to_list(mdata)
def discover_catalog(name, automatic_inclusion, **kwargs):
    """Load the bundled schema for `name` and attach inclusion metadata.

    Keyword args:
        unsupported: fields to mark 'unsupported' (default: empty set).
        stream_automatic_inclusion: mark the whole stream 'automatic'.
    """
    unsupported = kwargs.get("unsupported", frozenset([]))
    stream_automatic_inclusion = kwargs.get("stream_automatic_inclusion", False)

    schema_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                               'schemas/{}.json'.format(name))
    with open(schema_path, "r") as schema_file:
        discovered_schema = json.load(schema_file)

    mdata = metadata.new()
    for field in discovered_schema["schema"]["properties"]:
        if field in automatic_inclusion:
            inclusion = 'automatic'
        elif field in unsupported:
            inclusion = 'unsupported'
        else:
            inclusion = 'available'
        mdata = metadata.write(mdata, ('properties', field),
                               'inclusion', inclusion)

    if stream_automatic_inclusion:
        mdata = metadata.write(mdata, (), 'inclusion', 'automatic')

    discovered_schema["metadata"] = metadata.to_list(mdata)
    return discovered_schema
def get_schema_for_type(typ, breadcrumb, mdata, null=False):
    """Translate a Marketo field type into a JSON-schema snippet and record
    the field's inclusion metadata.

    See http://developers.marketo.com/rest-api/lead-database/fields/field-types/
    """
    if typ in ('datetime', 'date'):
        rtn = {"type": "string", "format": "date-time"}
    elif typ in ('integer', 'percent', 'score'):
        rtn = {'type': 'integer'}
    elif typ in ('float', 'currency'):
        rtn = {'type': 'number'}
    elif typ == 'boolean':
        rtn = {'type': 'boolean'}
    elif typ in STRING_TYPES:
        rtn = {'type': 'string'}
    elif typ == 'array':
        rtn = {
            'type': 'array',
            'items': {
                'type': ['integer', 'number', 'string', 'null']
            }
        }
    else:
        # Unknown types fall back to plain strings.
        rtn = {'type': 'string'}

    # Nullable fields are merely 'available'; non-nullable fields are
    # always emitted ('automatic').
    inclusion = 'available' if null else 'automatic'
    if null:
        rtn["type"] = [rtn["type"], "null"]
    rtn["inclusion"] = inclusion
    mdata = metadata.write(mdata, breadcrumb, 'inclusion', inclusion)
    return rtn, mdata
def do_discover():
    """Discover every stream and assemble the catalog."""
    raw_schemas = _load_schemas()
    catalog_entries = []
    for stream_name, schema in raw_schemas.items():
        # create and add catalog entry
        stream = STREAM_OBJECTS[stream_name]
        mdata = metadata.to_map(metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream.key_properties,
            valid_replication_keys=stream.replication_keys,
            replication_method=stream.replication_method,
        ))

        # Replication keys must always be emitted, so force them to
        # 'automatic'. FIX: assign the return value of metadata.write()
        # instead of depending on in-place mutation of the map, matching
        # the convention used everywhere else in this codebase.
        for field_name in stream.replication_keys:
            mdata = metadata.write(mdata, ('properties', field_name),
                                   'inclusion', 'automatic')

        catalog_entries.append({
            "stream": stream_name,
            "tap_stream_id": stream_name,
            "schema": schema,
            "metadata": metadata.to_list(mdata),
            "key_properties": stream.key_properties,
        })
    return Catalog.from_dict({"streams": catalog_entries})
def _populate_metadata(schema_name: str, schema: Dict) -> Dict:
    """
    Populates initial metadata for each field in a schema.

    Args:
        schema_name: The schema name to generate metadata for
            e.g. 'general_ledger_accounts'.
        schema: The corresponding JSON schema.

    Returns:
        Metadata dictionary for the selected stream. Fields are disabled
        by default.
    """
    key_properties = KEY_PROPERTIES[schema_name]

    mdata = metadata.new()
    mdata = metadata.write(mdata, (), 'table-key-properties', key_properties)
    mdata = metadata.write(mdata, (), 'selected', False)

    for field_name in schema['properties']:
        breadcrumb = ('properties', field_name)
        if field_name in key_properties:
            mdata = metadata.write(mdata, breadcrumb, 'inclusion', 'automatic')
        else:
            # Non-key fields start out available but deselected.
            mdata = metadata.write(mdata, breadcrumb, 'inclusion', 'available')
            mdata = metadata.write(mdata, breadcrumb, 'selected', False)
    return mdata
def discover(ctx):
    """Build the discovery catalog after verifying API credentials."""
    check_credentials_are_authorized(ctx)
    catalog = Catalog([])
    for stream in streams.STREAMS:
        stream_id = stream.tap_stream_id
        pk_fields = streams.PK_FIELDS[stream_id]
        schema = Schema.from_dict(streams.load_schema(stream_id),
                                  inclusion="available")

        mdata = metadata.new()
        for prop in schema.properties:
            inclusion = 'automatic' if prop in pk_fields else 'available'
            mdata = metadata.write(mdata, ('properties', prop),
                                   'inclusion', inclusion)

        catalog.streams.append(
            CatalogEntry(
                stream=stream_id,
                tap_stream_id=stream_id,
                key_properties=pk_fields,
                schema=schema,
                metadata=metadata.to_list(mdata)))
    return catalog
def get_metadata(self):
    """Build the singer metadata list for this stream's schema.

    Key properties are derived from the schema itself: every property
    whose name contains 'date' is treated as part of the table key.
    """
    field_names = self.schema.get('properties').keys()
    self.key_properties = [name for name in field_names if 'date' in name]

    mdata = metadata.new()
    mdata = metadata.write(mdata, (), 'table-key-properties',
                           self.key_properties)
    mdata = metadata.write(mdata, (), 'forced-replication-method',
                           'INCREMENTAL')

    for name in field_names:
        inclusion = 'automatic' if name in self.key_properties else 'available'
        mdata = metadata.write(mdata, ('properties', name),
                               'inclusion', inclusion)
        mdata = metadata.write(mdata, ('properties', name),
                               'selected-by-default', True)
    return metadata.to_list(mdata)
def discover_table_schema(client, table_name):
    """Describe a DynamoDB table and build its catalog entry.

    Returns None (after logging a critical error) when AWS authorization
    fails.
    """
    try:
        table_info = client.describe_table(TableName=table_name).get(
            'Table', {})
    except ClientError:
        LOGGER.critical(
            "Authorization to AWS failed. Please ensure the role and policy are configured correctly on your AWS account."
        )
        return None

    # write stream metadata
    key_props = [key_schema.get('AttributeName')
                 for key_schema in table_info.get('KeySchema', [])]

    mdata = {}
    mdata = metadata.write(mdata, (), 'table-key-properties', key_props)
    if table_info.get('ItemCount'):
        mdata = metadata.write(mdata, (), 'row-count',
                               table_info['ItemCount'])

    return {
        'table_name': table_name,
        'stream': table_name,
        'tap_stream_id': table_name,
        'metadata': metadata.to_list(mdata),
        'schema': {
            'type': 'object'
        }
    }
def load_metadata(self):
    """Build the singer metadata list for this stream.

    Key properties and the replication key are marked 'automatic';
    everything else is 'available'. For period streams, the time field
    that does not apply to the configured period is removed.
    """
    schema = self.load_schema()

    mdata = metadata.new()
    mdata = metadata.write(mdata, (), 'table-key-properties',
                           self.key_properties)
    mdata = metadata.write(mdata, (), 'forced-replication-method',
                           self.replication_method)
    if self.replication_key:
        mdata = metadata.write(mdata, (), 'valid-replication-keys',
                               [self.replication_key])

    for field_name in schema['properties'].keys():
        if field_name in self.key_properties or field_name == self.replication_key:
            mdata = metadata.write(mdata, ('properties', field_name),
                                   'inclusion', 'automatic')
        else:
            mdata = metadata.write(mdata, ('properties', field_name),
                                   'inclusion', 'available')

    # For period stream adjust schema for time period.
    # FIX: use pop(..., None) so a schema that happens to lack the field
    # no longer raises KeyError.
    if hasattr(self, 'period') and self.period == 'hourRange':
        mdata.pop(('properties', 'day'), None)
    elif hasattr(self, 'period') and self.period == 'dayRange':
        mdata.pop(('properties', 'hour'), None)

    return metadata.to_list(mdata)
def get_schemas():
    """Load every stream's bundled JSON schema and build its metadata.

    Returns:
        tuple: ({stream_name: schema}, {stream_name: metadata list})
    """
    schemas = {}
    schemas_metadata = {}

    for stream_name, stream_object in STREAMS.items():
        with open(get_abs_path('schemas/{}.json'.format(stream_name))) as file:
            schema = json.load(file)

        mdata = metadata.to_map(metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream_object.key_properties,
            replication_method=stream_object.replication_method))

        if stream_object.valid_replication_keys:
            mdata = metadata.write(mdata, (), 'valid-replication-keys',
                                   stream_object.valid_replication_keys)
        # The replication key must always be emitted with each record.
        if stream_object.replication_key:
            mdata = metadata.write(
                mdata, ('properties', stream_object.replication_key),
                'inclusion', 'automatic')

        schemas[stream_name] = schema
        schemas_metadata[stream_name] = metadata.to_list(mdata)

    return schemas, schemas_metadata
def produce_collection_schema(collection: Collection) -> Dict:
    """
    Generate a schema/catalog from the collection details for discovery mode
    Args:
        collection: stream Collection
    Returns: collection catalog
    """
    collection_name = collection.name
    collection_db_name = collection.database.name
    is_view = collection.options().get('viewOn') is not None

    mdata = {}
    mdata = metadata.write(mdata, (), 'table-key-properties', ['_id'])
    mdata = metadata.write(mdata, (), 'database-name', collection_db_name)
    mdata = metadata.write(mdata, (), 'row-count',
                           collection.estimated_document_count())
    mdata = metadata.write(mdata, (), 'is-view', is_view)

    # write valid-replication-key metadata by finding fields that have indexes on them.
    # cannot get indexes for views -- NB: This means no key-based incremental for views?
    if not is_view:
        # index_information() returns a map of index_name -> index info;
        # each index's 'key' is a list of (field_name, sort_direction)
        # tuples. We don't support compound indexes.
        valid_replication_keys = [
            index_info.get('key')[0][0]
            for index_info in collection.index_information().values()
            if len(index_info.get('key')) == 1 and index_info.get('key')[0]
        ]
        if valid_replication_keys:
            mdata = metadata.write(mdata, (), 'valid-replication-keys',
                                   valid_replication_keys)

    return {
        'table_name': collection_name,
        'stream': collection_name,
        'metadata': metadata.to_list(mdata),
        'tap_stream_id': "{}-{}".format(collection_db_name, collection_name),
        'schema': {
            'type': 'object',
            'properties': {
                "_id": {
                    "type": ["string", "null"]
                },
                "document": {
                    "type": ["object", "array", "string", "null"]
                },
                "_sdc_deleted_at": {
                    "type": ["string", "null"]
                },
            },
        }
    }
def test_build_field_list_include_datecreated(self):
    """Selecting 'datecreated' should add it to the built field list."""
    breadcrumb = ('properties', 'datecreated')
    singer_metadata.write(self.metadata, breadcrumb, 'selected', True)

    field_list, ids_to_breadcrumbs = tap_quickbase.build_field_lists(
        self.schema, self.metadata, [])

    self.assertEqual(2, len(field_list))
    self.assertEqual(['properties', 'datecreated'], ids_to_breadcrumbs['1'])
def write_sql_data_type_md(mdata, col_info):
    """Record a column's SQL datatype in its property metadata.

    bit columns wider than one bit are written with an explicit length,
    e.g. "bit(3)"; every other column uses its raw sql_data_type.
    """
    column = col_info.column_name
    data_type = col_info.sql_data_type
    if data_type == 'bit' and col_info.character_maximum_length > 1:
        data_type = "bit({})".format(col_info.character_maximum_length)
    return metadata.write(mdata, ('properties', column),
                          'sql-datatype', data_type)
def generate_metadata(schema):
    """Build a metadata list marking every schema property 'automatic'.

    'id' is declared as the table key.
    """
    mdata = metadata.new()
    mdata = metadata.write(mdata, (), 'table-key-properties', ['id'])
    # Iterate keys directly: the property sub-schemas were unpacked via
    # .items() but never used.
    for field_name in schema['properties']:
        mdata = metadata.write(mdata, ('properties', field_name),
                               'inclusion', 'automatic')
    return metadata.to_list(mdata)
def field_to_property_schema(field, mdata):
    """Map a Salesforce field description to a JSON-schema property.

    Dispatches on the field's Salesforce type. Branch order matters:
    membership sets (STRING_TYPES, DATE_TYPES, NUMBER_TYPES, LOOSE_TYPES,
    BINARY_TYPES) are defined elsewhere and may overlap, so earlier
    branches take precedence.

    Returns:
        (property_schema, mdata) tuple; binary fields are recorded as
        unsupported in the metadata map.
    """
    property_schema = {}
    field_name = field['name']
    sf_type = field['type']
    if sf_type in STRING_TYPES:
        property_schema['type'] = "string"
    elif sf_type in DATE_TYPES:
        # Dates may arrive malformed, so also allow a plain (nullable) string.
        date_type = {"type": "string", "format": "date-time"}
        string_type = {"type": ["string", "null"]}
        property_schema["anyOf"] = [date_type, string_type]
    elif sf_type == "boolean":
        property_schema['type'] = "boolean"
    elif sf_type in NUMBER_TYPES:
        property_schema['type'] = "number"
    elif sf_type == "address":
        # Compound address field: fixed set of nullable sub-fields.
        property_schema['type'] = "object"
        property_schema['properties'] = {
            "street": {"type": ["null", "string"]},
            "state": {"type": ["null", "string"]},
            "postalCode": {"type": ["null", "string"]},
            "city": {"type": ["null", "string"]},
            "country": {"type": ["null", "string"]},
            "longitude": {"type": ["null", "number"]},
            "latitude": {"type": ["null", "number"]},
            "geocodeAccuracy": {"type": ["null", "string"]}
        }
    elif sf_type == "int":
        property_schema['type'] = "integer"
    elif sf_type == "time":
        property_schema['type'] = "string"
    elif sf_type in LOOSE_TYPES:
        # No type = all types; return the empty schema before the
        # null-wrapping below can run.
        return property_schema, mdata
    elif sf_type in BINARY_TYPES:
        # Binary data cannot be replicated; mark the field unsupported.
        mdata = metadata.write(mdata, ('properties', field_name),
                               "inclusion", "unsupported")
        mdata = metadata.write(mdata, ('properties', field_name),
                               "unsupported-description", "binary data")
        return property_schema, mdata
    elif sf_type == 'location':
        # geo coordinates are numbers or objects divided into two fields for lat/long
        property_schema['type'] = ["number", "object", "null"]
        property_schema['properties'] = {
            "longitude": {"type": ["null", "number"]},
            "latitude": {"type": ["null", "number"]}
        }
    elif sf_type == 'json':
        property_schema['type'] = "string"
    else:
        raise TapSalesforceException("Found unsupported type: {}".format(sf_type))

    # The nillable field cannot be trusted, so force every type nullable
    # except Id, location (already nullable), and dates (handled via anyOf).
    if field_name != 'Id' and sf_type != 'location' and sf_type not in DATE_TYPES:
        property_schema['type'] = ["null", property_schema['type']]

    return property_schema, mdata
def generate_base_metadata(all_cubes, schema):
    """Build the base metadata map for a Google Analytics report stream.

    Records the set of cubes on the stream, then tags the synthetic
    report fields as automatic and groups them under "Report Fields".
    """
    report_fields = ["_sdc_record_hash", "start_date", "end_date",
                     "account_id", "web_property_id", "profile_id"]

    mdata = metadata.to_map(
        metadata.get_standard_metadata(schema=schema,
                                       key_properties=["_sdc_record_hash"]))
    mdata = metadata.write(mdata, (), "tap_google_analytics.all_cubes",
                           list(all_cubes))

    for field_name in report_fields:
        mdata = metadata.write(mdata, ("properties", field_name),
                               "inclusion", "automatic")
    for field_name in report_fields:
        mdata = metadata.write(mdata, ("properties", field_name),
                               "tap_google_analytics.group", "Report Fields")
    return mdata
def load_metadata(schema):
    """Build the metadata list for a schema.

    Every property is marked 'automatic'. When the schema contains a
    RECORDNO field it is declared as the table key.
    """
    mdata = metadata.new()
    for field_name in schema.get('properties', {}).keys():
        mdata = metadata.write(mdata, ('properties', field_name),
                               'inclusion', 'automatic')
        if field_name == "RECORDNO":
            # FIX: table-key-properties must be a list of field names, not
            # a bare string; a string reads as an iterable of characters to
            # downstream consumers, consistent with every other stream here.
            mdata = metadata.write(mdata, (), 'table-key-properties',
                                   ["RECORDNO"])
    return metadata.to_list(mdata)
def create_column_metadata(cols):
    """Build column metadata: per-column default selection (unsupported
    columns are deselected) plus the column's SQL datatype."""
    mdata = metadata.write({}, (), 'selected-by-default', False)
    for col in cols:
        breadcrumb = ('properties', col.column_name)
        col_schema = schema_for_column(col)
        mdata = metadata.write(mdata, breadcrumb, 'selected-by-default',
                               col_schema.inclusion != 'unsupported')
        mdata = metadata.write(mdata, breadcrumb, 'sql-datatype',
                               col.column_type.lower())
    return metadata.to_list(mdata)
def generate_metadata(schema_name, schema):
    """Build the metadata list for a schema: primary-key fields are
    'automatic', every other property is 'available'."""
    pk_fields = SCHEMA_PRIMARY_KEYS[schema_name]

    mdata = metadata.new()
    mdata = metadata.write(mdata, (), 'table-key-properties', pk_fields)
    for field_name in schema['properties']:
        inclusion = 'automatic' if field_name in pk_fields else 'available'
        mdata = metadata.write(mdata, ('properties', field_name),
                               'inclusion', inclusion)
    return metadata.to_list(mdata)
def load_metadata(table_spec, schema):
    """Build the metadata list for a table spec: key properties are
    'automatic', everything else is 'available'."""
    key_properties = table_spec.get('key_properties', [])

    mdata = metadata.new()
    # Intentionally a direct lookup: a spec without key_properties should
    # fail loudly here.
    mdata = metadata.write(mdata, (), 'table-key-properties',
                           table_spec['key_properties'])

    for field_name in schema.get('properties', {}):
        is_key = bool(key_properties) and field_name in key_properties
        mdata = metadata.write(mdata, ('properties', field_name),
                               'inclusion',
                               'automatic' if is_key else 'available')
    return metadata.to_list(mdata)
def default_streams(config):
    """Build a single-stream catalog entry for the configured topic."""
    primary_keys = config.get("primary_keys", [])
    mdata = metadata.write({}, (), "table-key-properties", primary_keys)
    return [{
        "tap_stream_id": config["topic"],
        "metadata": metadata.to_list(mdata),
        "schema": config["schema"]
    }]