def run_discovery(cls, args):
    """Apply the configuration and return the dumped discovery catalog.

    If the config contains ``base_id`` only that base is discovered; otherwise
    every base returned by ``__get_base_ids`` is discovered by id and name.
    """
    config = args.config
    cls.__apply_config(config)

    if "base_id" in config:
        discovered = cls.discover_base(config['base_id'])
    else:
        discovered = []
        for base_info in cls.__get_base_ids():
            discovered.extend(
                cls.discover_base(base_info["id"], base_info["name"]))

    return Catalog(discovered).dump()
def discover(detect=True):
    """Build the discovery catalog with one entry per schema.

    Args:
        detect: When true, every name in ``ldap_core.SCHEMA_NAMES`` is passed
            to ``ldap_core.detect_schema``; otherwise the schemas come from
            ``catalog_spec.load_schemas()`` (presumably a mapping of schema
            name to schema, since it is iterated with ``.items()``).

    Returns:
        A ``Catalog`` whose entries have empty metadata and key properties.
    """
    if detect:
        # Key the detected schemas by name. The loop below iterates
        # ``.items()``, which a plain list of schemas does not provide.
        raw_schemas = {
            schema_name: ldap_core.detect_schema(schema_name)
            for schema_name in ldap_core.SCHEMA_NAMES
        }
    else:
        raw_schemas = catalog_spec.load_schemas()

    streams = []
    for schema_name, schema in raw_schemas.items():
        # TODO: populate any metadata and stream's key properties here..
        stream_metadata = []
        key_properties = []
        streams.append(
            CatalogEntry(
                tap_stream_id=schema_name,
                stream=schema_name,
                schema=schema,
                key_properties=key_properties,
                metadata=stream_metadata,
                replication_key=None,
                is_view=None,
                database=None,
                table=None,
                row_count=None,
                stream_alias=None,
                replication_method=None,
            )
        )
    return Catalog(streams)
def discover():
    """Assemble a catalog from the statically defined stream schemas.

    A field listed in ``schemas.PK_FIELDS`` for its stream gets ``automatic``
    inclusion; every other field is ``available``.
    """
    catalog = Catalog([])
    for stream_id in schemas.STATIC_SCHEMA_STREAM_IDS:
        schema = Schema.from_dict(schemas.load_schema(stream_id))
        field_metadata = [
            {
                "metadata": {
                    "inclusion": (
                        "automatic"
                        if name in schemas.PK_FIELDS[stream_id]
                        else "available"
                    ),
                },
                "breadcrumb": ["properties", name],
            }
            for name in schema.properties.keys()
        ]
        entry = CatalogEntry(
            stream=stream_id,
            tap_stream_id=stream_id,
            key_properties=schemas.PK_FIELDS[stream_id],
            schema=schema,
            metadata=field_metadata,
        )
        catalog.streams.append(entry)
    return catalog
def discover(client):
    """Build a catalog from the static schemas plus client-derived streams.

    The first pass adds one entry per schema returned by ``get_schemas()``.
    The second pass calls ``_records_streams`` and ``_partner_records_streams``
    with ``client``, converts each result with ``_convert_to_singer_streams``
    and adds those streams, keyed by ``PRIMARY_KEYS``.
    """
    catalog = Catalog([])

    static_schemas, field_metadata = get_schemas()
    for name, schema_dict in static_schemas.items():
        static_schema = Schema.from_dict(schema_dict)
        primary_key = get_pk(name)
        static_metadata = field_metadata[name]
        catalog.streams.append(
            CatalogEntry(
                stream=name,
                tap_stream_id=name,
                key_properties=primary_key,
                schema=static_schema,
                metadata=static_metadata,
            ))

    for stream_source in (_records_streams, _partner_records_streams):
        converted = _convert_to_singer_streams(stream_source(client))
        for name, stream_data in converted.items():
            dynamic_schema = Schema.from_dict(stream_data['schema'])
            dynamic_metadata = stream_data['metadata']
            catalog.streams.append(
                CatalogEntry(
                    stream=name,
                    tap_stream_id=name,
                    key_properties=PRIMARY_KEYS[name],
                    schema=dynamic_schema,
                    metadata=dynamic_metadata,
                ))

    return catalog
def discover(config):
    """Return a catalog with one entry per schema from ``generate_schemas``.

    Every entry gets empty metadata and key properties; all optional
    ``CatalogEntry`` fields are passed as ``None``.
    """
    return Catalog([
        CatalogEntry(
            tap_stream_id=stream_id,
            stream=stream_id,
            schema=schema,
            key_properties=[],
            metadata=[],
            replication_key=None,
            is_view=None,
            database=None,
            table=None,
            row_count=None,
            stream_alias=None,
            replication_method=None,
        )
        for stream_id, schema in generate_schemas(config).items()
    ])
def discover():
    """Build the catalog for every schema returned by ``load_schemas()``.

    Each stream is keyed by ``gradable_id`` (``section_scores`` additionally
    by ``section_id``) and uses ``date_graded`` as its replication key.

    Returns:
        A ``Catalog`` with one ``CatalogEntry`` per loaded schema.
    """
    raw_schemas = load_schemas()
    streams = []
    for stream_id, schema in raw_schemas.items():
        key_properties = ['gradable_id']
        if stream_id == 'section_scores':
            key_properties.append('section_id')

        stream_metadata = metadata.get_standard_metadata(
            schema=schema.to_dict(),
            key_properties=key_properties,
            # Must be a list of field names: a bare string would be stored
            # as-is in the metadata and any membership test against it would
            # match substrings rather than whole field names.
            valid_replication_keys=['date_graded'],
            replication_method=None)

        streams.append(
            CatalogEntry(
                tap_stream_id=stream_id,
                stream=stream_id,
                schema=schema,
                key_properties=key_properties,
                metadata=stream_metadata,
                replication_key='date_graded',
                is_view=None,
                database=None,
                table=None,
                row_count=None,
                stream_alias=None,
                replication_method=None,
            ))
    return Catalog(streams)
def main():
    """Parse the command-line arguments and run discovery or sync mode."""
    # Keys that must be present in the config file.
    args = singer.parse_args(['client_id', 'client_secret', 'start_date'])

    config = args.config
    catalog = args.catalog or Catalog([])
    state = args.state

    # Instantiate the client.
    client = BillwerkClient(config)

    properties_without_catalog = args.properties and not args.catalog
    if properties_without_catalog:
        raise Exception(
            "DEPRECATED: Use of the 'properties' parameter is not supported. Please use --catalog instead"
        )

    if not args.discover:
        LOGGER.info('Starting sync mode')
        do_sync(client, config, state, catalog)
        return

    LOGGER.info("Starting discovery mode")
    write_catalog(do_discover())
def test_should_output_records(self, mock_stdout, requests_mock):
    """Syncing the selected ``roles`` stream writes one SCHEMA and two RECORD lines."""
    # Page 0 serves the roles fixture, page 1 serves the empty fixture.
    mocked_pages = (
        ("https://api.nikabot.com/api/v1/roles?limit=1000&page=0", ROLES_RESPONSE),
        ("https://api.nikabot.com/api/v1/roles?limit=1000&page=1", EMPTY_RESPONSE),
    )
    for url, payload in mocked_pages:
        requests_mock.get(url, json=json.loads(payload))

    roles_entry = CatalogEntry(
        tap_stream_id="roles",
        stream="roles",
        schema=Schema.from_dict({}),
        key_properties=["id"],
        metadata=[{"breadcrumb": [], "metadata": {"selected": True}}],
    )
    config = {"access_token": "my-access-token", "page_size": 1000}
    state = {}

    sync(config, state, Catalog(streams=[roles_entry]))

    expected_calls = [
        call('{"type": "SCHEMA", "stream": "roles", "schema": {}, "key_properties": ["id"]}\n'),
        call(
            '{"type": "RECORD", "stream": "roles", "record": {"id": "d893ebf32d49c35c1d754774", "team_id": "T034F9NPW", "name": "0.5"}, "time_extracted": "2020-01-01T00:00:00.000000Z"}\n'
        ),
        call(
            '{"type": "RECORD", "stream": "roles", "record": {"id": "cfabd9aa6f3e6381a716da58", "team_id": "T034F9NPW", "name": "0.1"}, "time_extracted": "2020-01-01T00:00:00.000000Z"}\n'
        ),
    ]
    assert mock_stdout.mock_calls == expected_calls
def discover():
    """ Run discovery mode: one catalog entry per schema from ``get_schemas()`` """
    schema_by_stream, metadata_by_stream = get_schemas()

    def build_entry(stream_id, schema):
        # Metadata is looked up before the key properties, as both may raise.
        stream_metadata = metadata_by_stream[stream_id]
        return CatalogEntry(
            tap_stream_id=stream_id,
            stream=stream_id,
            schema=schema,
            key_properties=STREAMS[stream_id]['key_properties'],
            metadata=stream_metadata,
            replication_key=None,
            is_view=None,
            database=None,
            table=None,
            row_count=None,
            stream_alias=None,
            replication_method=None,
        )

    entries = [
        build_entry(stream_id, schema)
        for stream_id, schema in schema_by_stream.items()
    ]
    return Catalog(entries)
def discover():
    '''
    Run discovery mode.

    Builds one catalog entry per item in ``STREAMS``, using the standard
    metadata for the stream's schema, key properties, replication key and
    replication method. Returns the resulting ``Catalog``.
    '''
    streams = []

    for stream_id, stream_object in STREAMS.items():
        raw_schema = load_schema(stream_id)
        schema = Schema.from_dict(raw_schema)
        replication_key = stream_object.replication_key

        mdata = metadata.to_map(
            metadata.get_standard_metadata(
                schema=raw_schema,
                schema_name=stream_id,
                key_properties=stream_object.key_properties,
                # A stream without a replication key must not advertise
                # ``[None]`` as its valid replication keys.
                valid_replication_keys=(
                    [replication_key] if replication_key else None),
                replication_method=stream_object.replication_method))

        # make sure that the replication key field is mandatory
        if replication_key:
            metadata.write(mdata, ('properties', replication_key),
                           'inclusion', 'automatic')

        streams.append(
            CatalogEntry(stream=stream_id,
                         tap_stream_id=stream_id,
                         key_properties=stream_object.key_properties,
                         schema=schema,
                         metadata=metadata.to_list(mdata)))

    return Catalog(streams)
def discover():
    """Return a catalog in which every loaded schema is an INCREMENTAL stream.

    Each entry uses ``date`` as its replication key and carries empty
    metadata and key properties.
    """
    entries = [
        CatalogEntry(
            tap_stream_id=stream_id,
            stream=stream_id,
            schema=schema,
            key_properties=[],
            metadata=[],
            replication_key="date",
            is_view=None,
            database=None,
            table=None,
            row_count=None,
            stream_alias=None,
            replication_method="INCREMENTAL",
        )
        for stream_id, schema in load_schemas().items()
    ]
    return Catalog(entries)
def discover():
    """Return a catalog whose streams are all pre-selected and INCREMENTAL.

    Each entry gets a single stream-level metadata record (``selected`` and
    ``schema-name``), no key properties, and ``currentpage`` as its
    replication key.
    """
    entries = []
    for stream_id, schema in load_schemas().items():
        # TODO: populate any metadata and stream's key properties here..
        stream_level_metadata = {
            "metadata": {"selected": True, "schema-name": stream_id},
            "breadcrumb": [],
        }
        entry = CatalogEntry(
            tap_stream_id=stream_id,
            stream=stream_id,
            schema=schema,
            key_properties=[],
            metadata=[stream_level_metadata],
            replication_key="currentpage",
            is_view=None,
            database=None,
            table=None,
            row_count=None,
            stream_alias=None,
            replication_method="INCREMENTAL",
        )
        entries.append(entry)
    return Catalog(entries)
def discover(ctx):
    """Verify credentials, then build the catalog for ``schemas.stream_ids``.

    Standard metadata is generated for every stream. For ``lists`` and
    ``messages`` the stream and all of its fields are additionally marked
    with ``automatic`` inclusion.
    """
    check_credentials_are_authorized(ctx)

    # NB: `lists` and `messages` are required for their substreams.
    # This is an approximation of the initial functionality using
    # metadata, which marked them as `selected=True` in the schema.
    always_included = ('lists', 'messages')

    catalog = Catalog([])
    for stream_id in schemas.stream_ids:
        schema_dict = schemas.load_schema(stream_id)
        schema = Schema.from_dict(schema_dict)
        mdata = metadata.to_map(
            metadata.get_standard_metadata(
                schema_dict, key_properties=schemas.PK_FIELDS[stream_id]))

        if stream_id in always_included:
            breadcrumbs = [()]
            breadcrumbs.extend(
                ('properties', name) for name in schema_dict['properties'])
            for breadcrumb in breadcrumbs:
                mdata = metadata.write(
                    mdata, breadcrumb, 'inclusion', 'automatic')

        entry = CatalogEntry(
            stream=stream_id,
            tap_stream_id=stream_id,
            key_properties=schemas.PK_FIELDS[stream_id],
            schema=schema,
            metadata=metadata.to_list(mdata))
        catalog.streams.append(entry)
    return catalog
def discover(config):
    """Build the catalog, reading per-stream metadata from JSON files if present.

    For each schema returned by ``load_schemas(config)`` the metadata is read
    from ``<metadata_dir>/<stream_id>.json``. When that file does not exist a
    single default stream-level record (unselected, ``available``) is used.

    Args:
        config: Tap configuration; ``config['metadata_dir']`` is read for
            every stream.

    Returns:
        A ``Catalog`` with one entry per loaded schema and no key properties.
    """
    raw_schemas = load_schemas(config)
    streams = []
    for stream_id, schema in raw_schemas.items():
        # Load metadata from the metadata folder.
        path = os.path.join(
            get_abs_path(config['metadata_dir']), stream_id + '.json')
        if os.path.isfile(path):
            # JSON text is UTF-8; do not depend on the locale's default.
            with open(path, encoding='utf-8') as file:
                stream_metadata = json.load(file)
        else:
            # No metadata file: fall back to default empty metadata.
            stream_metadata = [
                {"metadata": {"selected": False, "inclusion": "available"},
                 "breadcrumb": []}
            ]
        key_properties = []

        streams.append(
            CatalogEntry(
                tap_stream_id=stream_id,
                stream=stream_id,
                schema=schema,
                key_properties=key_properties,
                metadata=stream_metadata,
                replication_key=None,
                is_view=None,
                database=None,
                table=None,
                row_count=None,
                stream_alias=None,
                replication_method=None,
            )
        )
    return Catalog(streams)
def discover():
    """Build the catalog for every schema returned by ``load_schemas()``.

    All streams are keyed by ``uuid``. Only the ``qa`` stream has a
    replication key (``sequence_id``); other streams have none.

    Returns:
        A ``Catalog`` with one ``CatalogEntry`` per loaded schema.
    """
    raw_schemas = load_schemas()
    streams = []
    for stream_id, schema in raw_schemas.items():
        key_properties = ['uuid']
        replication_key = 'sequence_id' if stream_id == 'qa' else None

        stream_metadata = metadata.get_standard_metadata(
            schema=schema.to_dict(),
            key_properties=key_properties,
            # Valid replication keys are a list of field names, so wrap the
            # single key instead of passing the bare string.
            valid_replication_keys=(
                [replication_key] if replication_key else None),
            replication_method=None)

        streams.append(
            CatalogEntry(
                tap_stream_id=stream_id,
                stream=stream_id,
                schema=schema,
                key_properties=key_properties,
                metadata=stream_metadata,
                replication_key=replication_key,
                is_view=None,
                database=None,
                table=None,
                row_count=None,
                stream_alias=None,
                replication_method=None,
            ))
    return Catalog(streams)
def main():
    """Parse arguments, work out the auth method, then discover or sync.

    A ``refresh_token`` in the config selects OAuth2 credentials; otherwise
    service-account details are required. The chosen method is stored in
    ``config['auth_method']`` before the extra keys are checked.
    """
    args = singer.parse_args(['start_date'])
    config = args.config
    validate_config_view_ids(config)

    if "refresh_token" in config:
        # A refresh token in the config implies OAuth2 credentials.
        auth_method = "oauth2"
        auth_keys = ['client_id', 'client_secret', 'refresh_token']
    else:
        # Otherwise service-account details should be present.
        auth_method = "service_account"
        auth_keys = ['client_email', 'private_key']
    config['auth_method'] = auth_method
    singer.utils.check_config(config, auth_keys)

    client = Client(config)
    catalog = args.catalog or Catalog([])
    state = args.state

    properties_without_catalog = args.properties and not args.catalog
    if properties_without_catalog:
        raise Exception(
            "DEPRECATED: Use of the 'properties' parameter is not supported. Please use --catalog instead"
        )

    if not args.discover:
        do_sync(client, config, catalog, state)
        return
    do_discover(client, config)
def discover():
    """Return a catalog whose streams all replicate on the ``date`` field.

    Key properties come from ``STREAM_CONFIGS``; the metadata is the standard
    metadata generated for each schema.
    """
    entries = []
    for stream_id, schema in load_schemas().items():
        # TODO: populate any metadata and stream's key properties here..
        primary_keys = STREAM_CONFIGS[stream_id]['key_properties']
        standard_metadata = metadata.get_standard_metadata(
            schema=schema.to_dict(),
            key_properties=primary_keys,
            # TODO: Verify this works / is necessary
            valid_replication_keys=['date'],
            replication_method=None,
        )
        entry = CatalogEntry(
            tap_stream_id=stream_id,
            stream=stream_id,
            schema=schema,
            key_properties=primary_keys,
            metadata=standard_metadata,
            replication_key='date',
            is_view=None,
            database=None,
            table=None,
            row_count=None,
            stream_alias=None,
            replication_method=None,
        )
        entries.append(entry)
    return Catalog(entries)
def test_should_output_records(self, mock_stdout, requests_mock):
    """Syncing the selected ``groups`` stream writes one SCHEMA and two RECORD lines."""
    # Page 0 serves the groups fixture, page 1 serves the empty fixture.
    mocked_pages = (
        ("https://api.nikabot.com/api/v1/groups?limit=1000&page=0", GROUPS_RESPONSE),
        ("https://api.nikabot.com/api/v1/groups?limit=1000&page=1", EMPTY_RESPONSE),
    )
    for url, payload in mocked_pages:
        requests_mock.get(url, json=json.loads(payload))

    groups_entry = CatalogEntry(
        tap_stream_id="groups",
        stream="groups",
        schema=Schema.from_dict({}),
        key_properties=["id"],
        metadata=[{"breadcrumb": [], "metadata": {"selected": True}}],
    )
    config = {"access_token": "my-access-token", "page_size": 1000}
    state = {}

    sync(config, state, Catalog(streams=[groups_entry]))

    expected_calls = [
        call('{"type": "SCHEMA", "stream": "groups", "schema": {}, "key_properties": ["id"]}\n'),
        call(
            '{"type": "RECORD", "stream": "groups", "record": {"id": "f1b4b37cc2658672770b789f", "team_id": "T034F9NPW", "name": "TA Squad 5"}, "time_extracted": "2020-01-01T00:00:00.000000Z"}\n'
        ),
        call(
            '{"type": "RECORD", "stream": "groups", "record": {"id": "3176700ac4f2203b825fae6c", "team_id": "T034F9NPW", "name": "Platform Toolkit"}, "time_extracted": "2020-01-01T00:00:00.000000Z"}\n'
        ),
    ]
    assert mock_stdout.mock_calls == expected_calls
def do_discovery(conn_config):
    """Discover every target database and dump the combined catalog.

    The database names come from ``conn_config['dbs_to_discover']`` when that
    is set; otherwise they are queried from ``pg_database``. For each name,
    ``conn_config['dbname']`` is overwritten in place before connecting.

    Returns the ``Catalog`` that was passed to ``dump_catalog``.
    """
    db_names = conn_config.get('dbs_to_discover')
    if not db_names:
        with post_db.open_connection(conn_config) as conn:
            with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
                LOGGER.info(
                    "Fetching all db's, to specify a single db include dbs_to_discover in config.json"
                )
                cur.execute(""" SELECT datname FROM pg_database WHERE datistemplate = false AND CASE WHEN version() LIKE '%Redshift%' THEN true ELSE has_database_privilege(datname,'CONNECT') END = true """)
                db_names = [row[0] for row in cur.fetchall()]

    discovered = []
    for db_name in db_names:
        LOGGER.info("Discovering db %s", db_name)
        # NOTE: this mutates the caller's config dict.
        conn_config['dbname'] = db_name
        with post_db.open_connection(conn_config) as conn:
            discovered = discovered + discover_db(conn)

    catalog = Catalog(discovered)
    dump_catalog(catalog)
    return catalog
def do_discover(self):
    """Return a catalog in which every stream and every field is selected.

    Each stream in ``self.streams`` has its ``tap`` attribute set to this
    object before its schema is read. All properties are marked with
    ``automatic`` inclusion.
    """
    logger.info("Starting discover")

    catalog = Catalog([])
    for stream in self.streams:
        stream.tap = self
        schema = Schema.from_dict(stream.get_schema())
        primary_keys = stream.key_properties

        # Stream-level record first, then one record per property.
        stream_metadata = [{"breadcrumb": [], "metadata": {"selected": True}}]
        stream_metadata.extend(
            {
                "breadcrumb": ["properties", prop],
                "metadata": {"inclusion": "automatic", "selected": True},
            }
            for prop in schema.properties.keys()
        )

        entry = CatalogEntry(
            stream=stream.schema,
            tap_stream_id=stream.schema,
            key_properties=primary_keys,
            schema=schema,
            metadata=stream_metadata,
        )
        catalog.streams.append(entry)

    return catalog
def discover():
    """Build the catalog for ``schemas.STATIC_SCHEMA_STREAM_IDS``.

    Primary-key fields (per ``schemas.PK_FIELDS``) are given ``automatic``
    inclusion and all remaining fields ``available``.
    """
    catalog = Catalog([])

    for stream_id in schemas.STATIC_SCHEMA_STREAM_IDS:
        schema = Schema.from_dict(schemas.load_schema(stream_id))

        per_field = []
        for name in schema.properties.keys():
            is_primary_key = name in schemas.PK_FIELDS[stream_id]
            per_field.append({
                'metadata': {
                    'inclusion': 'automatic' if is_primary_key else 'available'
                },
                'breadcrumb': ['properties', name]
            })

        entry = CatalogEntry(
            stream=stream_id,
            tap_stream_id=stream_id,
            key_properties=schemas.PK_FIELDS[stream_id],
            schema=schema,
            metadata=per_field
        )
        catalog.streams.append(entry)

    return catalog
def run_discovery(cls, args):
    """Fetch the base's table metadata over HTTP and return the dumped catalog.

    Issues a GET to ``metadata_url + base_id`` with a bearer token and builds
    one ``CatalogEntry`` per item in the response's ``tables`` list. Every
    column is typed as nullable string; a key ``id`` column is always added
    and any field named ``Id`` is skipped.
    """
    config = args.config
    auth_headers = {'Authorization': 'Bearer {}'.format(config['token'])}
    response = requests.get(url=config['metadata_url'] + config['base_id'],
                            headers=auth_headers)

    entries = []
    for table in response.json()["tables"]:
        table_name = table["name"]
        selected = config['selected_by_default']

        columns = {"id": {"type": ["null", "string"], 'key': True}}
        columns.update({
            field["name"]: {"type": ["null", "string"]}
            for field in table["fields"]
            if field["name"] != "Id"
        })

        table_metadata = {
            "selected": selected,
            "name": table_name,
            "properties": columns,
        }
        entries.append(
            CatalogEntry(table=table_name, stream=table_name,
                         metadata=table_metadata))

    return Catalog(entries).dump()
def discover():
    """Return a catalog in which every loaded schema is a FULL_TABLE stream.

    The replication method is carried in the stream-level metadata record;
    key properties are empty.
    """
    def full_table_entry(stream_id, schema):
        # TODO: populate any metadata and stream's key properties here..
        return CatalogEntry(
            tap_stream_id=stream_id,
            stream=stream_id,
            schema=schema,
            key_properties=[],
            metadata=[{
                "breadcrumb": [],
                "metadata": {"replication-method": "FULL_TABLE"},
            }],
            replication_key=None,
            is_view=None,
            database=None,
            table=None,
            row_count=None,
            stream_alias=None,
        )

    return Catalog([
        full_table_entry(stream_id, schema)
        for stream_id, schema in load_schemas().items()
    ])
def discover(select_all, client, spreadsheet_id):
    """Build the catalog for the spreadsheet's streams.

    Key properties come from ``STREAMS`` when the stream defines them there;
    otherwise the last non-empty ``table-key-properties`` value found in the
    stream's metadata is used (``None`` if there is none). When
    ``select_all`` is truthy, ``select_all_fields_in_streams`` is applied to
    the finished catalog.
    """
    sheet_schemas, field_metadata = get_schemas(client, spreadsheet_id)
    catalog = Catalog([])

    for stream_name, schema_dict in sheet_schemas.items():
        schema = Schema.from_dict(schema_dict, selected=select_all)
        mdata = field_metadata[stream_name]

        declared_keys = [
            item.get('metadata', {}).get('table-key-properties')
            for item in mdata
        ]
        non_empty_keys = [keys for keys in declared_keys if keys]
        fallback_keys = non_empty_keys[-1] if non_empty_keys else None

        stream_config = STREAMS.get(stream_name, {})
        entry = CatalogEntry(
            stream=stream_name,
            tap_stream_id=stream_name,
            key_properties=stream_config.get('key_properties', fallback_keys),
            schema=schema,
            metadata=mdata)
        catalog.streams.append(entry)

    if select_all:
        select_all_fields_in_streams(catalog)
    return catalog
def discover():
    """Return a catalog of all loaded schemas, each keyed by ``id``.

    ``issues`` advertises ``updated_at`` and ``messages`` advertises
    ``created_at`` as a valid replication key; other streams advertise none.
    No replication key is set on the entries themselves.
    """
    replication_keys_by_stream = {
        'issues': ['updated_at'],
        'messages': ['created_at'],
    }

    entries = []
    for stream_id, schema in load_schemas().items():
        primary_keys = ['id']
        standard_metadata = metadata.get_standard_metadata(
            schema=schema.to_dict(),
            key_properties=primary_keys,
            valid_replication_keys=replication_keys_by_stream.get(stream_id),
            replication_method=None,
        )
        entry = CatalogEntry(
            tap_stream_id=stream_id,
            stream=stream_id,
            schema=schema,
            key_properties=primary_keys,
            metadata=standard_metadata,
            replication_key=None,
            is_view=None,
            database=None,
            table=None,
            row_count=None,
            stream_alias=None,
            replication_method=None,
        )
        entries.append(entry)
    return Catalog(entries)
def do_discover(self):
    """Return a catalog built from the standard metadata of ``self.streams``.

    Each stream has its ``tap`` attribute set to this object before its
    schema is read. A stream with a ``state_field`` advertises it as the only
    valid replication key and has that property forced to ``automatic``.
    """
    logger.info('Starting discover')

    catalog = Catalog([])
    for stream in self.streams:
        stream.tap = self
        schema = Schema.from_dict(stream.get_schema())
        primary_keys = stream.key_properties

        stream_metadata = metadata.get_standard_metadata(
            schema=schema.to_dict(),
            key_properties=primary_keys,
            valid_replication_keys=(
                [stream.state_field] if stream.state_field else None),
            replication_method=stream.replication_method)

        # A state field must always be emitted, so mark it automatic.
        if stream.state_field:
            metadata_map = metadata.to_map(stream_metadata)
            field_entry = metadata_map[('properties', stream.state_field)]
            field_entry['inclusion'] = 'automatic'
            stream_metadata = metadata.to_list(metadata_map)

        entry = CatalogEntry(
            stream=stream.schema,
            tap_stream_id=stream.schema,
            key_properties=primary_keys,
            schema=schema,
            metadata=stream_metadata)
        catalog.streams.append(entry)

    return catalog
def test(self):
    """``Catalog.get_stream`` returns the entry with the requested id."""
    catalog = Catalog(
        [CatalogEntry(tap_stream_id='a'),
         CatalogEntry(tap_stream_id='b'),
         CatalogEntry(tap_stream_id='c')])
    entry = catalog.get_stream('b')
    # ``assertEquals`` is a deprecated alias that was removed in Python 3.12.
    self.assertEqual('b', entry.tap_stream_id)
def do_discover(self):
    """Return a catalog with per-field inclusion metadata for ``self.streams``.

    Each stream has its ``tap`` attribute set to this object before its
    schema is read. Key properties are ``automatic``; all other properties
    are ``available``.
    """
    logger.info('Starting discover')

    catalog = Catalog([])
    for stream in self.streams:
        stream.tap = self
        schema = Schema.from_dict(stream.get_schema())
        primary_keys = stream.key_properties

        field_metadata = [
            {
                'breadcrumb': ['properties', prop],
                'metadata': {
                    'inclusion':
                        'automatic' if prop in primary_keys else 'available'
                }
            }
            for prop in schema.properties.keys()
        ]

        entry = CatalogEntry(
            stream=stream.schema,
            tap_stream_id=stream.schema,
            key_properties=primary_keys,
            schema=schema,
            metadata=field_metadata)
        catalog.streams.append(entry)

    return catalog
def discover():
    """Build the catalog from ``get_schemas()`` using ``PKS`` as primary keys.

    Primary-key fields are given ``automatic`` inclusion and all other
    fields ``available``.
    """
    catalog = Catalog([])

    for stream_name, schema_dict in get_schemas().items():
        schema = Schema.from_dict(schema_dict)
        primary_keys = PKS[stream_name]

        field_metadata = [
            {
                'metadata': {
                    'inclusion':
                        'automatic' if name in primary_keys else 'available'
                },
                'breadcrumb': ['properties', name]
            }
            for name in schema_dict['properties'].keys()
        ]

        entry = CatalogEntry(
            stream=stream_name,
            tap_stream_id=stream_name,
            key_properties=primary_keys,
            schema=schema,
            metadata=field_metadata)
        catalog.streams.append(entry)

    return catalog
def do_discovery(conn_config):
    """Discover every matching database and dump the combined catalog.

    The database names are queried from ``pg_database``; when
    ``conn_config['filter_dbs']`` is set the query is first passed through
    ``post_db.filter_dbs_sql_clause``. For each name,
    ``conn_config['dbname']`` is overwritten in place before connecting.

    Returns the ``Catalog`` that was passed to ``dump_catalog``.
    """
    with post_db.open_connection(conn_config) as conn:
        with conn.cursor(cursor_factory=psycopg2.extras.DictCursor,
                         name='stitch_cursor') as cur:
            cur.itersize = post_db.cursor_iter_size
            query = """SELECT datname FROM pg_database WHERE datistemplate = false AND CASE WHEN version() LIKE '%Redshift%' THEN true ELSE has_database_privilege(datname,'CONNECT') END = true """

            db_filter = conn_config.get('filter_dbs')
            if db_filter:
                query = post_db.filter_dbs_sql_clause(query, db_filter)

            LOGGER.info("Running DB discovery: %s", query)
            cur.execute(query)
            db_names = [row[0] for row in cur.fetchall()]

    discovered = []
    for db_name in db_names:
        LOGGER.info("Discovering db %s", db_name)
        # NOTE: this mutates the caller's config dict.
        conn_config['dbname'] = db_name
        with post_db.open_connection(conn_config) as conn:
            discovered = discovered + discover_db(conn)

    catalog = Catalog(discovered)
    dump_catalog(catalog)
    return catalog