class KarapaceSchemaRegistry(KarapaceBase):
    """REST handlers for the schema registry API.

    Schemas, per-subject configuration and deletions are written as messages
    to the Kafka topic named by ``config["topic_name"]`` and read back through
    a ``KafkaSchemaReader`` (``self.ksr``). Write requests are applied locally
    only when ``get_master`` reports this node as master; otherwise they are
    forwarded to the master's URL.

    NOTE(review): every handler answers by calling ``self.r(...)``, which is
    defined outside this class. The control flow here (code continuing after
    an error response, two consecutive ``self.r`` calls, ...) only makes sense
    if ``self.r`` raises to end the request -- confirm against ``KarapaceBase``.
    """

    # pylint: disable=attribute-defined-outside-init

    def __init__(self, config: dict) -> None:
        """Register the HTTP routes, then start the producer, coordinator and reader."""
        super().__init__(config=config)
        self._add_schema_registry_routes()
        self._init_schema_registry(config=config)

    def _init_schema_registry(self, config: dict) -> None:  # pylint: disable=unused-argument
        """Create the Kafka producer, master coordinator, schema reader and write lock."""
        # Pre-set to None so that close() can test these attributes even when
        # a later step of the initialisation fails.
        self.ksr = None
        self.producer = None
        self.producer = self._create_producer()
        self._create_master_coordinator()
        self._create_schema_reader()
        # Serialises the local write paths (register / delete).
        self.schema_lock = asyncio.Lock()

    def _add_schema_registry_routes(self) -> None:
        """Bind every schema-registry URL pattern to its handler method."""
        self.route(
            "/compatibility/subjects/<subject:path>/versions/<version:path>",
            callback=self.compatibility_check,
            method="POST",
            schema_request=True,
        )
        self.route(
            "/config/<subject:path>",
            callback=self.config_subject_get,
            method="GET",
            schema_request=True,
            with_request=True,
            json_body=False,
        )
        self.route("/config/<subject:path>", callback=self.config_subject_set, method="PUT", schema_request=True)
        self.route("/config", callback=self.config_get, method="GET", schema_request=True)
        self.route("/config", callback=self.config_set, method="PUT", schema_request=True)
        self.route(
            "/schemas/ids/<schema_id:path>/versions",
            callback=self.schemas_get_versions,
            method="GET",
            schema_request=True,
        )
        self.route("/schemas/ids/<schema_id:path>", callback=self.schemas_get, method="GET", schema_request=True)
        self.route("/schemas/types", callback=self.schemas_types, method="GET", schema_request=True)
        self.route("/subjects", callback=self.subjects_list, method="GET", schema_request=True)
        self.route(
            "/subjects/<subject:path>/versions",
            callback=self.subject_post,
            method="POST",
            schema_request=True,
        )
        self.route(
            "/subjects/<subject:path>",
            callback=self.subjects_schema_post,
            method="POST",
            schema_request=True,
        )
        self.route(
            "/subjects/<subject:path>/versions",
            callback=self.subject_versions_list,
            method="GET",
            schema_request=True,
        )
        self.route(
            "/subjects/<subject:path>/versions/<version>",
            callback=self.subject_version_get,
            method="GET",
            schema_request=True,
        )
        self.route(
            "/subjects/<subject:path>/versions/<version:path>",
            callback=self.subject_version_delete,
            method="DELETE",
            schema_request=True,
            with_request=True,
            json_body=False,
        )
        self.route(
            "/subjects/<subject:path>/versions/<version>/schema",
            callback=self.subject_version_schema_get,
            method="GET",
            schema_request=True,
        )
        self.route(
            "/subjects/<subject:path>",
            callback=self.subject_delete,
            method="DELETE",
            schema_request=True,
            with_request=True,
            json_body=False,
        )

    async def close(self) -> None:
        """Close the base application, then the coordinator, reader and producer if set."""
        await super().close()
        self.log.info("Shutting down all auxiliary threads")
        if self.mc:
            self.mc.close()
        if self.ksr:
            self.ksr.close()
        if self.producer:
            self.producer.close()

    def _create_schema_reader(self):
        """Create and start the ``KafkaSchemaReader`` stored in ``self.ksr``."""
        self.ksr = KafkaSchemaReader(config=self.config, master_coordinator=self.mc)
        self.ksr.start()

    def _create_master_coordinator(self):
        """Create and start the ``MasterCoordinator`` stored in ``self.mc``."""
        self.mc = MasterCoordinator(config=self.config)
        self.mc.start()

    def _subject_get(self, subject, content_type, include_deleted=False) -> Dict[str, Any]:
        """Return a copy of the subject's data with ``"schemas"`` set to its schemas.

        Answers 404 ``SUBJECT_NOT_FOUND`` when the subject is unknown or when
        ``self.ksr.get_schemas`` returns nothing for it. ``include_deleted`` is
        passed through to ``get_schemas``.
        """
        subject_data = self.ksr.subjects.get(subject)
        if not subject_data:
            self.r(
                body={
                    "error_code": SchemaErrorCodes.SUBJECT_NOT_FOUND.value,
                    "message": SchemaErrorMessages.SUBJECT_NOT_FOUND_FMT.value.format(subject=subject),
                },
                content_type=content_type,
                status=HTTPStatus.NOT_FOUND,
            )

        schemas = self.ksr.get_schemas(subject, include_deleted=include_deleted)
        if not schemas:
            self.r(
                body={
                    "error_code": SchemaErrorCodes.SUBJECT_NOT_FOUND.value,
                    "message": SchemaErrorMessages.SUBJECT_NOT_FOUND_FMT.value.format(subject=subject),
                },
                content_type=content_type,
                status=HTTPStatus.NOT_FOUND,
            )

        # Work on a shallow copy so the reader's own mapping is not modified.
        subject_data = subject_data.copy()
        subject_data["schemas"] = schemas
        return subject_data

    def _validate_version(self, content_type, version):  # pylint: disable=inconsistent-return-statements
        """Return ``version`` unchanged if it is a positive integer or ``"latest"``.

        Any other value is answered with 422 ``INVALID_VERSION_ID``.
        """
        try:
            version_number = int(version)
            if version_number > 0:
                return version
        except ValueError:
            if version == "latest":
                return version
        self.r(
            body={
                "error_code": SchemaErrorCodes.INVALID_VERSION_ID.value,
                "message": (
                    f"The specified version '{version}' is not a valid version id. "
                    "Allowed values are between [1, 2^31-1] and the string \"latest\""
                ),
            },
            content_type=content_type,
            status=HTTPStatus.UNPROCESSABLE_ENTITY,
        )

    def _get_compatibility_mode(self, subject, content_type) -> CompatibilityModes:
        """Return the subject's compatibility mode, falling back to the global one.

        ``subject`` is a subject-data mapping (not a subject name). A value
        that is not a valid ``CompatibilityModes`` member is answered with 500.
        """
        compatibility = subject.get("compatibility", self.ksr.config["compatibility"])

        try:
            compatibility_mode = CompatibilityModes(compatibility)
        except ValueError:
            # Using INTERNAL_SERVER_ERROR because the subject and configuration
            # should have been validated before.
            self.r(
                body={
                    "error_code": SchemaErrorCodes.HTTP_INTERNAL_SERVER_ERROR.value,
                    "message": f"Unknown compatibility mode {compatibility}",
                },
                content_type=content_type,
                status=HTTPStatus.INTERNAL_SERVER_ERROR,
            )
        return compatibility_mode

    def get_offset_from_queue(self, sent_offset):
        """Block until ``sent_offset`` is taken from ``self.ksr.queue``.

        Offsets that do not match are put back on the queue before the next
        attempt.
        """
        start_time = time.monotonic()
        while True:
            self.log.info("Starting to wait for offset: %r from ksr queue", sent_offset)
            offset = self.ksr.queue.get()
            if offset == sent_offset:
                self.log.info(
                    "We've consumed back produced offset: %r message back, everything is in sync, took: %.4f",
                    offset,
                    time.monotonic() - start_time,
                )
                break
            self.log.warning("Put the offset: %r back to queue, someone else is waiting for this?", offset)
            self.ksr.queue.put(offset)

    def send_kafka_message(self, key, value):
        """Produce ``key``/``value`` to the schema topic and wait until it is read back.

        ``str`` arguments are UTF-8 encoded first. Returns the producer future.
        """
        if isinstance(key, str):
            key = key.encode("utf8")
        if isinstance(value, str):
            value = value.encode("utf8")

        future = self.producer.send(self.config["topic_name"], key=key, value=value)
        self.producer.flush(timeout=self.kafka_timeout)
        msg = future.get(self.kafka_timeout)
        self.log.debug("Sent kafka msg key: %r, value: %r, offset: %r", key, value, msg.offset)
        self.get_offset_from_queue(msg.offset)
        return future

    def send_schema_message(
        self,
        *,
        subject: str,
        schema: Optional[TypedSchema],
        schema_id: int,
        version: int,
        deleted: bool,
    ):
        """Send a ``SCHEMA`` message for one subject version.

        When ``schema`` is falsy the message value is the empty string; the
        callers in this class use that for permanent deletion.
        """
        # NOTE(review): the key is assembled by string formatting, so `subject`
        # is not JSON-escaped -- confirm subjects cannot contain quotes.
        key = '{{"subject":"{}","version":{},"magic":1,"keytype":"SCHEMA"}}'.format(subject, version)
        if schema:
            valuedict = {
                "subject": subject,
                "version": version,
                "id": schema_id,
                "schema": schema.schema_str,
                "deleted": deleted,
            }
            # AVRO is the implied default, so the type is only written otherwise.
            if schema.schema_type is not SchemaType.AVRO:
                valuedict["schemaType"] = schema.schema_type
            value = json_encode(valuedict, compact=True)
        else:
            value = ""
        return self.send_kafka_message(key, value)

    def send_config_message(self, compatibility_level: CompatibilityModes, subject=None):
        """Send a ``CONFIG`` message; ``subject=None`` addresses the global config."""
        if subject is not None:
            key = '{{"subject":"{}","magic":0,"keytype":"CONFIG"}}'.format(subject)
        else:
            key = '{"subject":null,"magic":0,"keytype":"CONFIG"}'
        value = '{{"compatibilityLevel":"{}"}}'.format(compatibility_level.value)
        return self.send_kafka_message(key, value)

    def send_delete_subject_message(self, subject, version):
        """Send a ``DELETE_SUBJECT`` message carrying ``subject`` and ``version``."""
        key = '{{"subject":"{}","magic":0,"keytype":"DELETE_SUBJECT"}}'.format(subject)
        value = '{{"subject":"{}","version":{}}}'.format(subject, version)
        return self.send_kafka_message(key, value)

    async def compatibility_check(self, content_type, *, subject, version, request):
        """Check for schema compatibility"""
        body = request.json
        self.log.info("Got request to check subject: %r, version_id: %r compatibility", subject, version)
        old = await self.subject_version_get(
            content_type=content_type, subject=subject, version=version, return_dict=True
        )
        self.log.info("Existing schema: %r, new_schema: %r", old["schema"], body["schema"])
        try:
            schema_type = SchemaType(body.get("schemaType", "AVRO"))
            new_schema = TypedSchema.parse(schema_type, body["schema"])
        except InvalidSchema:
            self.log.warning("Invalid schema: %r", body["schema"])
            self.r(
                body={
                    "error_code": SchemaErrorCodes.INVALID_AVRO_SCHEMA.value,
                    "message": "Invalid Avro schema",
                },
                content_type=content_type,
                status=HTTPStatus.UNPROCESSABLE_ENTITY,
            )
        try:
            old_schema_type = SchemaType(old.get("schemaType", "AVRO"))
            old_schema = TypedSchema.parse(old_schema_type, old["schema"])
        except InvalidSchema:
            self.log.warning("Invalid existing schema: %r", old["schema"])
            self.r(
                body={
                    "error_code": SchemaErrorCodes.INVALID_AVRO_SCHEMA.value,
                    "message": "Invalid Avro schema",
                },
                content_type=content_type,
                status=HTTPStatus.UNPROCESSABLE_ENTITY,
            )

        # `old` carries the subject's "compatibility" entry when it has one.
        compatibility_mode = self._get_compatibility_mode(subject=old, content_type=content_type)

        result = check_compatibility(
            old_schema=old_schema,
            new_schema=new_schema,
            compatibility_mode=compatibility_mode,
        )
        if is_incompatible(result):
            self.log.warning(
                "Invalid schema %s found by compatibility check: old: %s new: %s", result, old_schema, new_schema
            )
            self.r({"is_compatible": False}, content_type)
        self.r({"is_compatible": True}, content_type)

    async def schemas_get(self, content_type, *, schema_id):
        """Answer with the schema stored under ``schema_id`` (404 if absent or non-numeric)."""
        try:
            schema_id_int = int(schema_id)
        except ValueError:
            self.r(
                body={
                    "error_code": SchemaErrorCodes.HTTP_NOT_FOUND.value,
                    "message": "HTTP 404 Not Found",
                },
                content_type=content_type,
                status=HTTPStatus.NOT_FOUND,
            )
        with self.ksr.id_lock:
            schema = self.ksr.schemas.get(schema_id_int)
        if not schema:
            self.log.warning("Schema: %r that was requested, not found", schema_id_int)
            self.r(
                body={
                    "error_code": SchemaErrorCodes.SCHEMA_NOT_FOUND.value,
                    "message": "Schema not found",
                },
                content_type=content_type,
                status=HTTPStatus.NOT_FOUND,
            )
        response_body = {"schema": schema.schema_str}
        if schema.schema_type is not SchemaType.AVRO:
            response_body["schemaType"] = schema.schema_type
        self.r(response_body, content_type)

    async def schemas_get_versions(self, content_type, *, schema_id):
        """Answer with the sorted (subject, version) pairs that use ``schema_id``.

        Versions flagged ``deleted`` are left out.
        """
        try:
            schema_id_int = int(schema_id)
        except ValueError:
            self.r(
                body={
                    "error_code": SchemaErrorCodes.HTTP_NOT_FOUND.value,
                    "message": "HTTP 404 Not Found",
                },
                content_type=content_type,
                status=HTTPStatus.NOT_FOUND,
            )

        subject_versions = []
        with self.ksr.id_lock:
            for subject, val in self.ksr.subjects.items():
                if self.ksr.get_schemas(subject) and "schemas" in val:
                    schemas = val["schemas"]
                    for version, schema in schemas.items():
                        if int(schema["id"]) == schema_id_int and not schema["deleted"]:
                            subject_versions.append({"subject": subject, "version": int(version)})
        subject_versions = sorted(subject_versions, key=lambda s: (s["subject"], s["version"]))
        self.r(subject_versions, content_type)

    async def schemas_types(self, content_type):
        """Answer with the list of schema type names."""
        self.r(["JSON", "AVRO"], content_type)

    async def config_get(self, content_type):
        """Answer with the global compatibility level."""
        # Note: The format sent by the user differs from the return value, this
        # is for compatibility reasons.
        self.r({"compatibilityLevel": self.ksr.config["compatibility"]}, content_type)

    async def config_set(self, content_type, *, request):
        """Set the global compatibility level, locally or through the master."""
        body = request.json

        try:
            compatibility_level = CompatibilityModes(body["compatibility"])
        except (ValueError, KeyError):
            self.r(
                body={
                    "error_code": SchemaErrorCodes.INVALID_COMPATIBILITY_LEVEL.value,
                    "message": SchemaErrorMessages.INVALID_COMPATIBILITY_LEVEL.value,
                },
                content_type=content_type,
                status=HTTPStatus.UNPROCESSABLE_ENTITY,
            )

        are_we_master, master_url = await self.get_master()
        if are_we_master:
            self.send_config_message(compatibility_level=compatibility_level, subject=None)
        elif not master_url:
            self.no_master_error(content_type)
        else:
            url = f"{master_url}/config"
            await self.forward_request_remote(body=body, url=url, content_type=content_type, method="PUT")

        self.r({"compatibility": self.ksr.config["compatibility"]}, content_type)

    async def config_subject_get(self, content_type, subject: str, *, request: HTTPRequest):
        """Answer with the subject's compatibility level.

        With the query parameter ``defaultToGlobal=true`` the global level is
        used when the subject has none of its own. Otherwise, or when the
        subject is unknown, the answer is 404 ``SUBJECT_NOT_FOUND``.
        """
        # Config for a subject can exist without schemas so no need to check for their existence
        assert self.ksr, "KarapaceSchemaRegistry not initialized. Missing call to _init"
        subject_data = self.ksr.subjects.get(subject, {})

        if subject_data:
            default_to_global = request.query.get("defaultToGlobal", "false").lower() == "true"
            compatibility = subject_data.get("compatibility")
            if not compatibility and default_to_global:
                compatibility = self.ksr.config["compatibility"]
            if compatibility:
                # Note: The format sent by the user differs from the return
                # value, this is for compatibility reasons.
                self.r(
                    {"compatibilityLevel": compatibility},
                    content_type,
                )

        self.r(
            body={
                "error_code": SchemaErrorCodes.SUBJECT_NOT_FOUND.value,
                "message": SchemaErrorMessages.SUBJECT_NOT_FOUND_FMT.value.format(subject=subject),
            },
            content_type=content_type,
            status=HTTPStatus.NOT_FOUND,
        )

    async def config_subject_set(self, content_type, *, request, subject):
        """Set one subject's compatibility level, locally or through the master."""
        try:
            compatibility_level = CompatibilityModes(request.json["compatibility"])
        except (ValueError, KeyError):
            self.r(
                body={
                    "error_code": SchemaErrorCodes.INVALID_COMPATIBILITY_LEVEL.value,
                    "message": "Invalid compatibility level",
                },
                content_type=content_type,
                status=HTTPStatus.UNPROCESSABLE_ENTITY,
            )

        are_we_master, master_url = await self.get_master()
        if are_we_master:
            self.send_config_message(compatibility_level=compatibility_level, subject=subject)
        elif not master_url:
            self.no_master_error(content_type)
        else:
            url = f"{master_url}/config/{subject}"
            await self.forward_request_remote(
                body=request.json, url=url, content_type=content_type, method="PUT"
            )

        self.r({"compatibility": compatibility_level.value}, content_type)

    async def subjects_list(self, content_type):
        """Answer with the names of all subjects for which ``get_schemas`` returns schemas."""
        subjects_list = [key for key in self.ksr.subjects if self.ksr.get_schemas(key)]
        self.r(subjects_list, content_type, status=HTTPStatus.OK)

    async def _subject_delete_local(self, content_type: str, subject: str, permanent: bool):
        """Delete a subject on this node and answer with the list of its versions.

        A permanent delete is refused with ``SUBJECT_NOT_SOFT_DELETED`` while
        any version is not yet flagged ``deleted``; otherwise one schema
        message with an empty value is sent per version. A non-permanent delete
        sends a single ``DELETE_SUBJECT`` message.
        """
        subject_data = self._subject_get(subject, content_type, include_deleted=permanent)

        if permanent and any(not value.get("deleted", False) for value in subject_data["schemas"].values()):
            self.r(
                body={
                    "error_code": SchemaErrorCodes.SUBJECT_NOT_SOFT_DELETED.value,
                    "message": f"Subject '{subject}' was not deleted first before being permanently deleted",
                },
                content_type=content_type,
                status=HTTPStatus.NOT_FOUND,
            )

        version_list = list(subject_data["schemas"])
        # The keys of "schemas" are version numbers, so despite its name this
        # holds the last listed version (0 when there is none).
        if version_list:
            latest_schema_id = version_list[-1]
        else:
            latest_schema_id = 0

        if permanent:
            # Iterate over a snapshot of the items while messages are sent.
            for version, value in list(subject_data["schemas"].items()):
                schema_id = value.get("id")
                self.log.info(
                    "Permanently deleting subject '%s' version %s (schema id=%s)", subject, version, schema_id
                )
                self.send_schema_message(
                    subject=subject, schema=None, schema_id=schema_id, version=version, deleted=True
                )
        else:
            self.send_delete_subject_message(subject, latest_schema_id)
        self.r(version_list, content_type, status=HTTPStatus.OK)

    async def subject_delete(self, content_type, *, subject, request: HTTPRequest):
        """Delete a subject locally when master, otherwise forward to the master."""
        permanent = request.query.get("permanent", "false").lower() == "true"

        are_we_master, master_url = await self.get_master()
        if are_we_master:
            async with self.schema_lock:
                await self._subject_delete_local(content_type, subject, permanent)
        elif not master_url:
            self.no_master_error(content_type)
        else:
            url = f"{master_url}/subjects/{subject}?permanent={permanent}"
            await self.forward_request_remote(body={}, url=url, content_type=content_type, method="DELETE")

    async def subject_version_get(self, content_type, *, subject, version, return_dict=False):
        """Answer with (or, if ``return_dict``, return) one version of a subject.

        ``version`` is a positive integer or ``"latest"``. The returned dict
        additionally carries the subject's ``"compatibility"`` entry when it
        has one.
        """
        self._validate_version(content_type, version)
        subject_data = self._subject_get(subject, content_type)
        schema_data = None
        max_version = max(subject_data["schemas"])
        if version == "latest":
            version = max_version
            schema_data = subject_data["schemas"][version]
        elif int(version) <= max_version:
            schema_data = subject_data["schemas"].get(int(version))
        # Covers both a version above the maximum and one missing below it.
        if not schema_data:
            self.r(
                body={
                    "error_code": SchemaErrorCodes.VERSION_NOT_FOUND.value,
                    "message": f"Version {version} not found.",
                },
                content_type=content_type,
                status=HTTPStatus.NOT_FOUND,
            )
        schema_id = schema_data["id"]
        schema = schema_data["schema"]

        ret = {
            "subject": subject,
            "version": int(version),
            "id": schema_id,
            "schema": schema.schema_str,
        }
        if schema.schema_type is not SchemaType.AVRO:
            ret["schemaType"] = schema.schema_type
        if return_dict:
            # Return also compatibility information to compatibility check
            if subject_data.get("compatibility"):
                ret["compatibility"] = subject_data.get("compatibility")
            return ret
        self.r(ret, content_type)

    async def _subject_version_delete_local(self, content_type: str, subject: str, version: int, permanent: bool):
        """Delete one version of a subject on this node and answer with the version.

        A soft delete of an already soft-deleted version and a permanent
        delete of a version that is not yet soft-deleted are both refused
        with 404.
        """
        subject_data = self._subject_get(subject, content_type, include_deleted=True)

        subject_schema_data = subject_data["schemas"].get(version, None)
        if not subject_schema_data:
            self.r(
                body={
                    "error_code": SchemaErrorCodes.VERSION_NOT_FOUND.value,
                    "message": f"Version {version} not found.",
                },
                content_type=content_type,
                status=HTTPStatus.NOT_FOUND,
            )
        if subject_schema_data.get("deleted", False) and not permanent:
            self.r(
                body={
                    "error_code": SchemaErrorCodes.SCHEMAVERSION_SOFT_DELETED.value,
                    # Report the version that was actually requested.
                    "message": (
                        f"Subject '{subject}' Version {version} was soft deleted."
                        "Set permanent=true to delete permanently"
                    ),
                },
                content_type=content_type,
                status=HTTPStatus.NOT_FOUND,
            )

        # Cannot directly hard delete
        if permanent and not subject_schema_data.get("deleted", False):
            self.r(
                body={
                    "error_code": SchemaErrorCodes.SCHEMAVERSION_NOT_SOFT_DELETED.value,
                    "message": (
                        f"Subject '{subject}' Version {version} was not deleted "
                        "first before being permanently deleted"
                    ),
                },
                content_type=content_type,
                status=HTTPStatus.NOT_FOUND,
            )

        schema_id = subject_schema_data["id"]
        schema = subject_schema_data["schema"]
        # A permanent delete sends no schema, i.e. an empty message value.
        self.send_schema_message(
            subject=subject,
            schema=None if permanent else schema,
            schema_id=schema_id,
            version=version,
            deleted=True,
        )
        self.r(str(version), content_type, status=HTTPStatus.OK)

    async def subject_version_delete(self, content_type, *, subject, version, request: HTTPRequest):
        """Delete one version locally when master, otherwise forward to the master."""
        version = int(version)
        permanent = request.query.get("permanent", "false").lower() == "true"

        are_we_master, master_url = await self.get_master()
        if are_we_master:
            async with self.schema_lock:
                await self._subject_version_delete_local(content_type, subject, version, permanent)
        elif not master_url:
            self.no_master_error(content_type)
        else:
            url = f"{master_url}/subjects/{subject}/versions/{version}?permanent={permanent}"
            await self.forward_request_remote(body={}, url=url, content_type=content_type, method="DELETE")

    async def subject_version_schema_get(self, content_type, *, subject, version):
        """Answer with the schema string of one version of a subject.

        ``version`` is a positive integer or ``"latest"``; a version that is
        not among the subject's schemas is answered with 404
        ``VERSION_NOT_FOUND``.
        """
        self._validate_version(content_type, version)
        subject_data = self._subject_get(subject, content_type)
        schemas = subject_data["schemas"]
        max_version = max(schemas)

        schema_data = None
        if version == "latest":
            schema_data = schemas[max_version]
        elif int(version) <= max_version:
            schema_data = schemas.get(int(version))
        # Also reached when the version is at or below the maximum but missing
        # from `schemas`; without this check the lookup below would fail on None.
        if not schema_data:
            self.r(
                body={
                    "error_code": SchemaErrorCodes.VERSION_NOT_FOUND.value,
                    "message": f"Version {version} not found.",
                },
                content_type=content_type,
                status=HTTPStatus.NOT_FOUND,
            )
        self.r(schema_data["schema"].schema_str, content_type)

    async def subject_versions_list(self, content_type, *, subject):
        """Answer with the list of version numbers of a subject."""
        subject_data = self._subject_get(subject, content_type)
        self.r(list(subject_data["schemas"]), content_type, status=HTTPStatus.OK)

    async def get_master(self) -> Tuple[bool, Optional[str]]:
        """Return ``(are_we_master, master_url)`` from the master coordinator.

        Polls once per second, under ``self.master_lock``, until a master is
        known and ``self.ksr.ready`` is no longer ``False``.
        """
        async with self.master_lock:
            while True:
                are_we_master, master_url = self.mc.get_master_info()
                if are_we_master is None:
                    self.log.info("No master set: %r, url: %r", are_we_master, master_url)
                elif self.ksr.ready is False:
                    self.log.info("Schema reader isn't ready yet: %r", self.ksr.ready)
                else:
                    return are_we_master, master_url
                await asyncio.sleep(1.0)

    def _validate_schema_request_body(self, content_type, body) -> None:
        """Require ``body`` to be a dict whose only keys are ``schema`` / ``schemaType``."""
        if not isinstance(body, dict):
            self.r(
                body={
                    "error_code": SchemaErrorCodes.HTTP_INTERNAL_SERVER_ERROR.value,
                    "message": "Internal Server Error",
                },
                content_type=content_type,
                status=HTTPStatus.INTERNAL_SERVER_ERROR,
            )
        for field in body:
            if field not in {"schema", "schemaType"}:
                self.r(
                    body={
                        "error_code": SchemaErrorCodes.HTTP_UNPROCESSABLE_ENTITY.value,
                        "message": f"Unrecognized field: {field}",
                    },
                    content_type=content_type,
                    status=HTTPStatus.UNPROCESSABLE_ENTITY,
                )

    def _validate_schema_type(self, content_type, body) -> None:
        """Answer 422 unless the body's ``schemaType`` (default AVRO) is AVRO or JSONSCHEMA."""
        requested = body.get("schemaType", SchemaType.AVRO.value)
        try:
            schema_type = SchemaType(requested)
        except ValueError:
            # NOTE(review): assumes SchemaType is an Enum, whose lookup raises
            # ValueError for an unknown value -- confirm. Without this handler
            # such a value would escape before the 422 answer below.
            rejected = requested
        else:
            if schema_type in {SchemaType.JSONSCHEMA, SchemaType.AVRO}:
                return
            rejected = schema_type
        self.r(
            body={
                "error_code": SchemaErrorCodes.HTTP_UNPROCESSABLE_ENTITY.value,
                "message": f"unrecognized schemaType: {rejected}",
            },
            content_type=content_type,
            status=HTTPStatus.UNPROCESSABLE_ENTITY,
        )

    def _validate_schema_key(self, content_type, body) -> None:
        """Answer 422 ``EMPTY_SCHEMA`` when the body has no ``schema`` key."""
        if "schema" not in body:
            self.r(
                body={
                    "error_code": SchemaErrorCodes.EMPTY_SCHEMA.value,
                    "message": "Empty schema",
                },
                content_type=content_type,
                status=HTTPStatus.UNPROCESSABLE_ENTITY,
            )

    async def subjects_schema_post(self, content_type, *, subject, request):
        """Look up the posted schema under ``subject`` and answer with its version and id.

        Answers 404 ``SCHEMA_NOT_FOUND`` when no schema of the subject equals
        the posted one.
        """
        body = request.json
        self._validate_schema_request_body(content_type, body)
        subject_data = self._subject_get(subject, content_type)
        new_schema = None
        if "schema" not in body:
            self.r(
                body={
                    "error_code": SchemaErrorCodes.HTTP_INTERNAL_SERVER_ERROR.value,
                    "message": f"Error while looking up schema under subject {subject}",
                },
                content_type=content_type,
                status=HTTPStatus.INTERNAL_SERVER_ERROR,
            )
        schema_str = body["schema"]
        schema_type = SchemaType(body.get("schemaType", "AVRO"))
        try:
            new_schema = TypedSchema.parse(schema_type, schema_str)
        except InvalidSchema:
            self.log.exception("No proper parser found")
            self.r(
                body={
                    "error_code": SchemaErrorCodes.HTTP_INTERNAL_SERVER_ERROR.value,
                    "message": f"Error while looking up schema under subject {subject}",
                },
                content_type=content_type,
                status=HTTPStatus.INTERNAL_SERVER_ERROR,
            )
        for schema in subject_data["schemas"].values():
            typed_schema = schema["schema"]
            if typed_schema == new_schema:
                ret = {
                    "subject": subject,
                    "version": schema["version"],
                    "id": schema["id"],
                    "schema": typed_schema.schema_str,
                }
                if schema_type is not SchemaType.AVRO:
                    ret["schemaType"] = schema_type
                self.r(ret, content_type)
            else:
                self.log.debug("Schema %r did not match %r", schema, typed_schema)
        self.r(
            body={
                "error_code": SchemaErrorCodes.SCHEMA_NOT_FOUND.value,
                "message": "Schema not found",
            },
            content_type=content_type,
            status=HTTPStatus.NOT_FOUND,
        )

    async def subject_post(self, content_type, *, subject, request):
        """Register a schema under ``subject``, locally when master, else via the master."""
        body = request.json
        self.log.debug("POST with subject: %r, request: %r", subject, body)
        self._validate_schema_request_body(content_type, body)
        self._validate_schema_type(content_type, body)
        self._validate_schema_key(content_type, body)
        are_we_master, master_url = await self.get_master()
        if are_we_master:
            async with self.schema_lock:
                # write_new_schema_local is a plain (non-async) method, so it
                # is called directly rather than awaited.
                self.write_new_schema_local(subject, body, content_type)
        elif not master_url:
            self.no_master_error(content_type)
        else:
            url = f"{master_url}/subjects/{subject}/versions"
            await self.forward_request_remote(body=body, url=url, content_type=content_type, method="POST")

    def write_new_schema_local(self, subject, body, content_type):
        """Since we're the master we get to write the new schema"""
        self.log.info("Writing new schema locally since we're the master")
        schema_type = SchemaType(body.get("schemaType", SchemaType.AVRO))
        try:
            new_schema = TypedSchema.parse(schema_type=schema_type, schema_str=body["schema"])
        except (InvalidSchema, InvalidSchemaType) as e:
            self.log.warning("Invalid schema: %r", body["schema"], exc_info=True)
            # Surface the parser's own message when the cause is a known parse error.
            if isinstance(e.__cause__, (SchemaParseException, JSONDecodeError)):
                human_error = f"{e.__cause__.args[0]}"  # pylint: disable=no-member
            else:
                human_error = "Provided schema is not valid"
            self.r(
                body={
                    "error_code": SchemaErrorCodes.INVALID_AVRO_SCHEMA.value,
                    "message": f"Invalid {schema_type} schema. Error: {human_error}",
                },
                content_type=content_type,
                status=HTTPStatus.UNPROCESSABLE_ENTITY,
            )
        if subject not in self.ksr.subjects or not self.ksr.subjects.get(subject)["schemas"]:
            schema_id = self.ksr.get_schema_id(new_schema)
            version = 1
            self.log.info(
                "Registering new subject: %r with version: %r to schema %r, schema_id: %r",
                subject,
                version,
                new_schema.schema_str,
                schema_id,
            )
        else:
            # First check if any of the existing schemas for the subject match
            subject_data = self.ksr.subjects[subject]
            schemas = self.ksr.get_schemas(subject)
            if not schemas:  # Previous ones have been deleted by the user.
                version = max(self.ksr.subjects[subject]["schemas"]) + 1
                schema_id = self.ksr.get_schema_id(new_schema)
                self.log.info(
                    "Registering subject: %r, id: %r new version: %r with schema %r, schema_id: %r",
                    subject,
                    schema_id,
                    version,
                    new_schema.schema_str,
                    schema_id,
                )
                self.send_schema_message(
                    subject=subject,
                    schema=new_schema,
                    schema_id=schema_id,
                    version=version,
                    deleted=False,
                )
                self.r({"id": schema_id}, content_type)

            schema_versions = sorted(schemas)
            # Go through these in version order
            for version in schema_versions:
                schema = subject_data["schemas"][version]
                if schema["schema"] == new_schema:
                    self.r({"id": schema["id"]}, content_type)
                else:
                    self.log.debug("schema: %s did not match with: %s", schema, new_schema)

            compatibility_mode = self._get_compatibility_mode(subject=subject_data, content_type=content_type)

            # Run a compatibility check between on file schema(s) and the one being submitted now
            # the check is either towards the latest one or against all previous ones in case of
            # transitive mode
            if compatibility_mode.is_transitive():
                check_against = schema_versions
            else:
                check_against = [schema_versions[-1]]

            for old_version in check_against:
                old_schema = subject_data["schemas"][old_version]["schema"]
                result = check_compatibility(
                    old_schema=old_schema,
                    new_schema=new_schema,
                    compatibility_mode=compatibility_mode,
                )
                if is_incompatible(result):
                    message = set(result.messages).pop() if result.messages else ""
                    self.log.warning("Incompatible schema: %s", result)
                    self.r(
                        body={
                            "error_code": SchemaErrorCodes.HTTP_CONFLICT.value,
                            "message": f"Incompatible schema, compatibility_mode={compatibility_mode.value} {message}",
                        },
                        content_type=content_type,
                        status=HTTPStatus.CONFLICT,
                    )

            # We didn't find an existing schema and the schema is compatible so go and create one
            schema_id = self.ksr.get_schema_id(new_schema)
            version = max(self.ksr.subjects[subject]["schemas"]) + 1
            self.log.info(
                "Registering subject: %r, id: %r new version: %r with schema %r, schema_id: %r",
                subject,
                schema_id,
                version,
                new_schema.to_json(),
                schema_id,
            )
        # Shared tail of the "new subject" and "new version" paths.
        self.send_schema_message(
            subject=subject,
            schema=new_schema,
            schema_id=schema_id,
            version=version,
            deleted=False,
        )
        self.r({"id": schema_id}, content_type)

    async def forward_request_remote(self, *, body, url, content_type, method="POST"):
        """Send the request to ``url`` and answer with the remote body and status."""
        self.log.info("Writing new schema to remote url: %r since we're not the master", url)
        response = await self.http_request(url=url, method=method, json=body, timeout=60.0)
        self.r(body=response.body, content_type=content_type, status=HTTPStatus(response.status))

    def no_master_error(self, content_type):
        """Answer 500 ``NO_MASTER_ERROR``; used when no master URL is known."""
        self.r(
            body={
                "error_code": SchemaErrorCodes.NO_MASTER_ERROR.value,
                "message": "Error while forwarding the request to the master.",
            },
            content_type=content_type,
            status=HTTPStatus.INTERNAL_SERVER_ERROR,
        )
def _create_master_coordinator(self):
    """Build a ``MasterCoordinator`` from ``self.config``, keep it on ``self.mc`` and start it."""
    coordinator = MasterCoordinator(config=self.config)
    # Publish the coordinator on the instance before starting it.
    self.mc = coordinator
    coordinator.start()
class KarapaceSchemaRegistry(KarapaceBase):
    """HTTP front-end for the schema registry.

    Registers the REST routes, owns the Kafka producer, the schema reader
    (``self.ksr``) and the master coordinator (``self.mc``), and implements the
    request handlers. Handlers answer by calling ``self.r(...)``; the code below
    relies on ``self.r`` not returning (it is always the last statement of a
    branch or is followed by code that assumes the branch was left).
    """

    # pylint: disable=attribute-defined-outside-init

    def __init__(self, config):
        super().__init__(config)
        self._add_routes()
        self._init()

    def _init(self):
        """Create the producer, the master coordinator and the schema reader."""
        # Pre-set every attribute ``close()`` inspects so that a failure part-way
        # through start-up does not turn into an AttributeError during shutdown.
        self.ksr = None
        self.mc = None
        self.producer = None
        self.producer = self._create_producer()
        self._create_master_coordinator()
        self._create_schema_reader()

    def _add_routes(self):
        """Register all schema-registry endpoints on the underlying REST app."""
        self.route(
            "/compatibility/subjects/<subject:path>/versions/<version:path>",
            callback=self.compatibility_check,
            method="POST",
            schema_request=True,
        )
        self.route(
            "/config/<subject:path>",
            callback=self.config_subject_get,
            method="GET",
            schema_request=True,
            with_request=True,
            json_body=False,
        )
        self.route("/config/<subject:path>", callback=self.config_subject_set, method="PUT", schema_request=True)
        self.route("/config", callback=self.config_get, method="GET", schema_request=True)
        self.route("/config", callback=self.config_set, method="PUT", schema_request=True)
        self.route("/schemas/ids/<schema_id:path>", callback=self.schemas_get, method="GET", schema_request=True)
        self.route("/subjects", callback=self.subjects_list, method="GET", schema_request=True)
        self.route(
            "/subjects/<subject:path>/versions", callback=self.subject_post, method="POST", schema_request=True
        )
        self.route("/subjects/<subject:path>", callback=self.subjects_schema_post, method="POST", schema_request=True)
        self.route(
            "/subjects/<subject:path>/versions",
            callback=self.subject_versions_list,
            method="GET",
            schema_request=True,
        )
        self.route(
            "/subjects/<subject:path>/versions/<version>",
            callback=self.subject_version_get,
            method="GET",
            schema_request=True,
        )
        self.route(
            "/subjects/<subject:path>/versions/<version:path>",
            callback=self.subject_version_delete,
            method="DELETE",
            schema_request=True,
        )
        self.route(
            "/subjects/<subject:path>/versions/<version>/schema",
            callback=self.subject_version_schema_get,
            method="GET",
            schema_request=True,
        )
        self.route("/subjects/<subject:path>", callback=self.subject_delete, method="DELETE", schema_request=True)

    def close(self):
        """Close the base app, then the coordinator, schema reader and producer if they exist."""
        super().close()
        self.log.info("Shutting down all auxiliary threads")
        if self.mc:
            self.mc.close()
        if self.ksr:
            self.ksr.close()
        if self.producer:
            self.producer.close()

    def _create_schema_reader(self):
        self.ksr = KafkaSchemaReader(config=self.config, master_coordinator=self.mc)
        self.ksr.start()

    def _create_master_coordinator(self):
        self.mc = MasterCoordinator(config=self.config)
        self.mc.start()

    def _subject_get(self, subject, content_type):
        """Return the subject's data with ``"schemas"`` replaced by ``ksr.get_schemas(subject)``.

        Responds 404 (error code 40401) when the subject is unknown or
        ``get_schemas`` returns nothing for it.
        """
        subject_data = self.ksr.subjects.get(subject)
        if not subject_data:
            self.r({"error_code": 40401, "message": "Subject not found."}, content_type, status=404)
        schemas = self.ksr.get_schemas(subject)
        if not schemas:
            self.r({"error_code": 40401, "message": "Subject not found."}, content_type, status=404)
        # Work on a shallow copy: assigning into the dict held by the reader would
        # overwrite the reader's own schema map for this subject, which other
        # code (e.g. next-version computation) reads directly.
        subject_data = subject_data.copy()
        subject_data["schemas"] = schemas
        return subject_data

    def _validate_version(self, content_type, version):  # pylint: disable=inconsistent-return-statements
        """Return ``version`` if it is a positive integer or ``"latest"``; otherwise respond 422."""
        try:
            version_number = int(version)
            if version_number > 0:
                return version
        except ValueError:
            if version == "latest":
                return version
        self.r(
            body={
                "error_code": 42202,
                "message": "The specified version is not a valid version id. "
                "Allowed values are between [1, 2^31-1] and the string \"latest\"",
            },
            content_type=content_type,
            status=422,
        )

    def get_offset_from_queue(self, sent_offset):
        """Block until ``sent_offset`` is read from the schema reader's queue.

        Any other offset taken from the queue is put back.
        """
        start_time = time.monotonic()
        while True:
            self.log.info("Starting to wait for offset: %r from ksr queue", sent_offset)
            offset = self.ksr.queue.get()
            if offset == sent_offset:
                self.log.info(
                    "We've consumed back produced offset: %r message back, everything is in sync, took: %.4f",
                    offset,
                    time.monotonic() - start_time,
                )
                break
            self.log.warning("Put the offset: %r back to queue, someone else is waiting for this?", offset)
            self.ksr.queue.put(offset)

    def send_kafka_message(self, key, value):
        """Produce ``key``/``value`` to the configured topic and wait until the reader has consumed it.

        ``str`` arguments are UTF-8 encoded first. Returns the producer future.
        """
        if isinstance(key, str):
            key = key.encode("utf8")
        if isinstance(value, str):
            value = value.encode("utf8")

        future = self.producer.send(self.config["topic_name"], key=key, value=value)
        self.producer.flush(timeout=self.kafka_timeout)
        msg = future.get(self.kafka_timeout)
        self.log.debug("Sent kafka msg key: %r, value: %r, offset: %r", key, value, msg.offset)
        self.get_offset_from_queue(msg.offset)
        return future

    def send_schema_message(
        self,
        *,
        subject: str,
        schema: TypedSchema,
        schema_id: int,
        version: int,
        deleted: bool,
    ):
        """Send a SCHEMA record for ``subject``/``version``; ``schemaType`` is included for non-Avro schemas."""
        key = '{{"subject":"{}","version":{},"magic":1,"keytype":"SCHEMA"}}'.format(subject, version)
        value = {
            "subject": subject,
            "version": version,
            "id": schema_id,
            "schema": json_encode(schema.to_json(), compact=True),
            "deleted": deleted,
        }
        if schema.schema_type is not SchemaType.AVRO:
            value["schemaType"] = schema.schema_type
        return self.send_kafka_message(key, json_encode(value, compact=True))

    def send_config_message(self, compatibility_level, subject=None):
        """Send a CONFIG record; ``subject=None`` produces the key with a null subject."""
        if subject is not None:
            key = '{{"subject":"{}","magic":0,"keytype":"CONFIG"}}'.format(subject)
        else:
            key = '{"subject":null,"magic":0,"keytype":"CONFIG"}'
        value = '{{"compatibilityLevel":"{}"}}'.format(compatibility_level)
        return self.send_kafka_message(key, value)

    def send_delete_subject_message(self, subject, version):
        """Send a DELETE_SUBJECT record carrying ``subject`` and ``version``."""
        key = '{{"subject":"{}","magic":0,"keytype":"DELETE_SUBJECT"}}'.format(subject)
        value = '{{"subject":"{}","version":{}}}'.format(subject, version)
        return self.send_kafka_message(key, value)

    async def compatibility_check(self, content_type, *, subject, version, request):
        """Check for schema compatibility"""
        body = request.json
        self.log.info("Got request to check subject: %r, version_id: %r compatibility", subject, version)
        old = await self.subject_version_get(
            content_type=content_type, subject=subject, version=version, return_dict=True
        )
        self.log.info("Existing schema: %r, new_schema: %r", old["schema"], body["schema"])
        try:
            schema_type = SchemaType(body.get("schemaType", "AVRO"))
            new = TypedSchema.parse(schema_type, body["schema"])
        except InvalidSchema:
            self.log.warning("Invalid schema: %r", body["schema"])
            self.r(
                body={"error_code": 44201, "message": "Invalid Avro schema"},
                content_type=content_type,
                status=422,
            )
        try:
            old_schema_type = SchemaType(old.get("schemaType", "AVRO"))
            old_schema = TypedSchema.parse(old_schema_type, old["schema"])
        except InvalidSchema:
            self.log.warning("Invalid existing schema: %r", old["schema"])
            self.r(
                body={"error_code": 44201, "message": "Invalid Avro schema"},
                content_type=content_type,
                status=422,
            )

        compat = Compatibility(
            source=old_schema,
            target=new,
            compatibility=old.get("compatibility", self.ksr.config["compatibility"]),
        )
        try:
            compat.check()
        except IncompatibleSchema as ex:
            self.log.warning("Invalid schema %s found by compatibility check: old: %s new: %s", ex, old_schema, new)
            self.r({"is_compatible": False}, content_type)
        self.r({"is_compatible": True}, content_type)

    async def schemas_get(self, content_type, *, schema_id):
        """Respond with the schema stored under ``schema_id``; 404 if the id is not an integer or unknown."""
        try:
            schema_id_int = int(schema_id)
        except ValueError:
            self.r({"error_code": 404, "message": "HTTP 404 Not Found"}, content_type, status=404)
        with self.ksr.id_lock:
            schema = self.ksr.schemas.get(schema_id_int)
        if not schema:
            self.log.warning("Schema: %r that was requested, not found", int(schema_id))
            self.r(
                body={"error_code": 40403, "message": "Schema not found"},
                content_type=content_type,
                status=404,
            )
        response_body = {"schema": str(schema)}
        if schema.schema_type is not SchemaType.AVRO:
            response_body["schemaType"] = schema.schema_type
        self.r(response_body, content_type)

    async def config_get(self, content_type):
        self.r({"compatibilityLevel": self.ksr.config["compatibility"]}, content_type)

    async def config_set(self, content_type, *, request):
        """Set the global compatibility level, locally when master or by forwarding to the master."""
        body = request.json
        if "compatibility" in request.json and request.json["compatibility"] in COMPATIBILITY_MODES:
            compatibility_level = request.json["compatibility"]
            are_we_master, master_url = await self.get_master()
            if are_we_master:
                self.send_config_message(compatibility_level=compatibility_level, subject=None)
            elif are_we_master is None:
                self.no_master_error(content_type)
            else:
                url = f"{master_url}/config"
                await self.forward_request_remote(body=body, url=url, content_type=content_type, method="PUT")
        else:
            self.r(
                body={
                    "error_code": 42203,
                    "message": "Invalid compatibility level. Valid values are none, backward, forward and full",
                },
                content_type=content_type,
                status=422,
            )
        self.r({"compatibility": self.ksr.config["compatibility"]}, content_type)

    async def config_subject_get(self, content_type, subject: str, *, request: HTTPRequest):
        """Respond with the subject's compatibility level, optionally falling back to the global one."""
        # Config for a subject can exist without schemas so no need to check for their existence
        subject_data = self.ksr.subjects.get(subject, {})
        if subject_data:
            default_to_global = request.query.get("defaultToGlobal", "false").lower() == "true"
            compatibility = subject_data.get("compatibility")
            if not compatibility and default_to_global:
                compatibility = self.ksr.config["compatibility"]
            if compatibility:
                self.r({"compatibilityLevel": compatibility}, content_type)
        self.r({"error_code": 40401, "message": "Subject not found."}, content_type, status=404)

    async def config_subject_set(self, content_type, *, request, subject):
        """Set a subject's compatibility level, locally when master or by forwarding to the master."""
        if "compatibility" in request.json and request.json["compatibility"] in COMPATIBILITY_MODES:
            are_we_master, master_url = await self.get_master()
            if are_we_master:
                self.send_config_message(compatibility_level=request.json["compatibility"], subject=subject)
            elif are_we_master is None:
                self.no_master_error(content_type)
            else:
                url = f"{master_url}/config/{subject}"
                await self.forward_request_remote(
                    body=request.json, url=url, content_type=content_type, method="PUT"
                )
        else:
            self.r(
                body={"error_code": 42203, "message": "Invalid compatibility level"},
                content_type=content_type,
                status=422,
            )
        self.r({"compatibility": request.json["compatibility"]}, content_type)

    async def subjects_list(self, content_type):
        """Respond with the subjects for which ``ksr.get_schemas`` returns something."""
        subjects_list = [key for key in self.ksr.subjects if self.ksr.get_schemas(key)]
        self.r(subjects_list, content_type, status=200)

    async def subject_delete(self, content_type, *, subject):
        """Delete a subject, locally when master or by forwarding to the master."""
        self._subject_get(subject, content_type)
        version_list = list(self.ksr.get_schemas(subject))
        # The last listed version is sent along with the delete record; 0 when none.
        if version_list:
            latest_schema_id = version_list[-1]
        else:
            latest_schema_id = 0
        are_we_master, master_url = await self.get_master()
        if are_we_master:
            self.send_delete_subject_message(subject, latest_schema_id)
            self.r(version_list, content_type, status=200)
        elif are_we_master is None:
            self.no_master_error(content_type)
        else:
            url = f"{master_url}/subjects/{subject}"
            await self.forward_request_remote(body={}, url=url, content_type=content_type, method="DELETE")

    async def subject_version_get(self, content_type, *, subject, version, return_dict=False):
        """Respond with (or, if ``return_dict``, return) one version of a subject.

        With ``return_dict=True`` the subject's compatibility setting, when
        present, is added to the returned dict.
        """
        self._validate_version(content_type, version)
        subject_data = self._subject_get(subject, content_type)
        schema_data = None
        max_version = max(subject_data["schemas"])
        if version == "latest":
            version = max(subject_data["schemas"])
            schema_data = subject_data["schemas"][version]
        elif int(version) <= max_version:
            schema_data = subject_data["schemas"].get(int(version))
        else:
            self.r({"error_code": 40402, "message": "Version not found."}, content_type, status=404)
        if not schema_data:
            self.r({"error_code": 40402, "message": "Version not found."}, content_type, status=404)
        schema_id = schema_data["id"]
        schema = schema_data["schema"]

        ret = {
            "subject": subject,
            "version": int(version),
            "id": schema_id,
            "schema": str(schema),
        }
        if schema.schema_type is not SchemaType.AVRO:
            ret["schemaType"] = schema.schema_type
        if return_dict:
            # Return also compatibility information to compatibility check
            if subject_data.get("compatibility"):
                ret["compatibility"] = subject_data.get("compatibility")
            return ret
        self.r(ret, content_type)

    async def subject_version_delete(self, content_type, *, subject, version):
        """Mark one version of a subject deleted, locally when master or by forwarding to the master."""
        version = int(version)
        subject_data = self._subject_get(subject, content_type)
        subject_schema_data = subject_data["schemas"].get(version, None)
        if not subject_schema_data:
            self.r({"error_code": 40402, "message": "Version not found."}, content_type, status=404)
        schema_id = subject_schema_data["id"]
        schema = subject_schema_data["schema"]
        are_we_master, master_url = await self.get_master()
        if are_we_master:
            self.send_schema_message(
                subject=subject, schema=schema, schema_id=schema_id, version=version, deleted=True
            )
            self.r(str(version), content_type, status=200)
        elif are_we_master is None:
            self.no_master_error(content_type)
        else:
            url = f"{master_url}/subjects/{subject}/versions/{version}"
            await self.forward_request_remote(body={}, url=url, content_type=content_type, method="DELETE")

    async def subject_version_schema_get(self, content_type, *, subject, version):
        """Respond with only the schema text of one version of a subject."""
        self._validate_version(content_type, version)
        subject_data = self._subject_get(subject, content_type)
        schemas = subject_data["schemas"]
        max_version = max(schemas)

        schema_data = None
        if version == "latest":
            schema_data = schemas[max_version]
        elif int(version) <= max_version:
            # May be None when the number is in range but that version is absent.
            schema_data = schemas.get(int(version))
        if not schema_data:
            self.r({"error_code": 40402, "message": "Version not found."}, content_type, status=404)
        self.r(str(schema_data["schema"]), content_type)

    async def subject_versions_list(self, content_type, *, subject):
        subject_data = self._subject_get(subject, content_type)
        self.r(list(subject_data["schemas"]), content_type, status=200)

    async def get_master(self):
        """Poll once a second until a master is known and the schema reader is ready.

        Returns the ``(master, master_url)`` pair from the coordinator.
        """
        async with self.master_lock:
            while True:
                master, master_url = self.mc.get_master_info()
                if master is None:
                    self.log.info("No master set: %r, url: %r", master, master_url)
                elif self.ksr.ready is False:
                    self.log.info("Schema reader isn't ready yet: %r", self.ksr.ready)
                else:
                    return master, master_url
                await asyncio.sleep(1.0)

    def _validate_schema_request_body(self, content_type, body):
        """Respond 500 if ``body`` is not a dict, 422 if it has keys other than ``schema``/``schemaType``."""
        if not isinstance(body, dict):
            self.r({"error_code": 500, "message": "Internal Server Error"}, content_type, status=500)
        for field in body:
            if field not in {"schema", "schemaType"}:
                self.r(
                    body={"error_code": 422, "message": f"Unrecognized field: {field}"},
                    content_type=content_type,
                    status=422,
                )

    def _validate_schema_type(self, content_type, body):
        """Respond 422 unless ``body["schemaType"]`` (default Avro) is a supported schema type."""
        raw_type = body.get("schemaType", SchemaType.AVRO.value)
        try:
            schema_type = SchemaType(raw_type)
        except ValueError:
            # An unknown value cannot be turned into the enum at all; report it
            # with the same 422 response instead of letting the error escape.
            schema_type = None
        if schema_type not in {SchemaType.JSONSCHEMA, SchemaType.AVRO}:
            shown = raw_type if schema_type is None else schema_type
            self.r(
                body={"error_code": 422, "message": f"unrecognized schemaType: {shown}"},
                content_type=content_type,
                status=422,
            )

    async def subjects_schema_post(self, content_type, *, subject, request):
        """Look up the posted schema under ``subject`` and respond with its version and id, or 404."""
        body = request.json
        self._validate_schema_request_body(content_type, body)
        subject_data = self._subject_get(subject, content_type)
        if "schema" not in body:
            self.r({"error_code": 500, "message": "Internal Server Error"}, content_type, status=500)
        schema_str = body["schema"]
        schema_type = SchemaType(body.get("schemaType", "AVRO"))
        try:
            new_schema = TypedSchema.parse(schema_type, schema_str)
        except InvalidSchema:
            self.log.exception("No proper parser found")
            self.r(
                {"error_code": 500, "message": f"Error while looking up schema under subject {subject}"},
                content_type,
                status=500,
            )
        for schema in subject_data["schemas"].values():
            typed_schema = schema["schema"]
            if typed_schema == new_schema:
                ret = {
                    "subject": subject,
                    "version": schema["version"],
                    "id": schema["id"],
                    "schema": str(typed_schema),
                }
                if schema_type is not SchemaType.AVRO:
                    ret["schemaType"] = schema_type
                self.r(ret, content_type)
            else:
                self.log.debug("Schema %r did not match %r", schema, typed_schema)
        self.r({"error_code": 40403, "message": "Schema not found"}, content_type, status=404)

    async def subject_post(self, content_type, *, subject, request):
        """Register a schema under ``subject``, locally when master or by forwarding to the master."""
        body = request.json
        self.log.debug("POST with subject: %r, request: %r", subject, body)
        self._validate_schema_request_body(content_type, body)
        self._validate_schema_type(content_type, body)
        if "schema" not in body:
            self.r({"error_code": 500, "message": "Internal Server Error"}, content_type, status=500)
        are_we_master, master_url = await self.get_master()
        if are_we_master:
            self.write_new_schema_local(subject, body, content_type)
        elif are_we_master is None:
            self.no_master_error(content_type)
        else:
            url = f"{master_url}/subjects/{subject}/versions"
            await self.forward_request_remote(body=body, url=url, content_type=content_type, method="POST")

    def write_new_schema_local(self, subject, body, content_type):
        """Since we're the master we get to write the new schema"""
        self.log.info("Writing new schema locally since we're the master")
        schema_type = SchemaType(body.get("schemaType", SchemaType.AVRO))
        try:
            new_schema = TypedSchema.parse(schema_type=schema_type, schema_str=body["schema"])
        except (InvalidSchema, InvalidSchemaType):
            self.log.warning("Invalid schema: %r", body["schema"], exc_info=True)
            self.r(
                body={"error_code": 44201, "message": f"Invalid {schema_type} schema"},
                content_type=content_type,
                status=422,
            )
        if subject not in self.ksr.subjects or not self.ksr.subjects.get(subject)["schemas"]:
            schema_id = self.ksr.get_schema_id(new_schema)
            version = 1
            self.log.info(
                "Registering new subject: %r with version: %r to schema %r, schema_id: %r",
                subject,
                version,
                new_schema.to_json(),
                schema_id,
            )
        else:
            # First check if any of the existing schemas for the subject match
            subject_data = self.ksr.subjects[subject]
            schemas = self.ksr.get_schemas(subject)
            if not schemas:  # Previous ones have been deleted by the user.
                version = max(self.ksr.subjects[subject]["schemas"]) + 1
                schema_id = self.ksr.get_schema_id(new_schema)
                self.log.info(
                    "Registering subject: %r, id: %r new version: %r with schema %r, schema_id: %r",
                    subject,
                    schema_id,
                    version,
                    new_schema.to_json(),
                    schema_id,
                )
                self.send_schema_message(
                    subject=subject,
                    schema=new_schema,
                    schema_id=schema_id,
                    version=version,
                    deleted=False,
                )
                self.r({"id": schema_id}, content_type)

            schema_versions = sorted(schemas)
            # Go through these in version order
            for version in schema_versions:
                schema = subject_data["schemas"][version]
                if schema["schema"] == new_schema:
                    self.r({"id": schema["id"]}, content_type)
                else:
                    self.log.debug("schema: %s did not match with: %s", schema, new_schema)

            compatibility = subject_data.get("compatibility", self.ksr.config["compatibility"])

            # Run a compatibility check between on file schema(s) and the one being submitted now
            # the check is either towards the latest one or against all previous ones in case of
            # transitive mode
            if compatibility in TRANSITIVE_MODES:
                check_against = schema_versions
            else:
                check_against = [schema_versions[-1]]

            for old_version in check_against:
                old_schema = subject_data["schemas"][old_version]["schema"]
                compat = Compatibility(old_schema, new_schema, compatibility=compatibility)
                try:
                    compat.check()
                except IncompatibleSchema as ex:
                    self.log.warning("Incompatible schema: %s", ex)
                    self.r(
                        body={
                            "error_code": 409,
                            "message": "Schema being registered is incompatible with an earlier schema",
                        },
                        content_type=content_type,
                        status=409,
                    )

            # We didn't find an existing schema and the schema is compatible so go and create one
            schema_id = self.ksr.get_schema_id(new_schema)
            version = max(self.ksr.subjects[subject]["schemas"]) + 1
            self.log.info(
                "Registering subject: %r, id: %r new version: %r with schema %r, schema_id: %r",
                subject,
                schema_id,
                version,
                new_schema.to_json(),
                schema_id,
            )
        self.send_schema_message(
            subject=subject,
            schema=new_schema,
            schema_id=schema_id,
            version=version,
            deleted=False,
        )
        self.r({"id": schema_id}, content_type)

    async def forward_request_remote(self, *, body, url, content_type, method="POST"):
        """Send the request to ``url`` and relay the response body and status to the client."""
        self.log.info("Writing new schema to remote url: %r since we're not the master", url)
        response = await self.http_request(url=url, method=method, json=body, timeout=60.0)
        self.r(body=response.body, content_type=content_type, status=response.status)

    def no_master_error(self, content_type):
        """Respond 500 (error code 50003) for the case where no master is available."""
        self.r(
            {"error_code": 50003, "message": "Error while forwarding the request to the master."},
            content_type,
            status=500,
        )
def init_admin(config):
    """Create a ``MasterCoordinator`` for ``config``, start it and return it."""
    coordinator = MasterCoordinator(config=config)
    coordinator.start()
    return coordinator
class Karapace(RestApp):
    """Schema-registry REST application.

    Reads its JSON configuration from ``config_path``, registers the HTTP
    routes, and owns the Kafka producer, the schema reader (``self.ksr``) and
    the master coordinator (``self.mc``). Handlers answer by calling
    ``self.r(...)``, which raises ``HTTPResponse`` and therefore never returns.
    """

    def __init__(self, config_path):
        super().__init__(app_name="Karapace")
        self.config = {}
        self.config_path = config_path
        self.log = logging.getLogger("Karapace")
        self.kafka_timeout = 10
        self.route(
            "/compatibility/subjects/<subject:path>/versions/<version:path>",
            callback=self.compatibility_check,
            method="POST",
        )
        self.route("/config/<subject:path>", callback=self.config_subject_get, method="GET")
        self.route("/config/<subject:path>", callback=self.config_subject_set, method="PUT")
        self.route("/config", callback=self.config_get, method="GET")
        self.route("/config", callback=self.config_set, method="PUT")
        self.route("/schemas/ids/<schema_id:path>", callback=self.schemas_get, method="GET")
        self.route("/subjects", callback=self.subjects_list, method="GET")
        self.route("/subjects/<subject:path>/versions", callback=self.subject_post, method="POST")
        self.route("/subjects/<subject:path>/versions", callback=self.subject_versions_list, method="GET")
        self.route(
            "/subjects/<subject:path>/versions/<version:path>", callback=self.subject_version_get, method="GET"
        )
        self.route(
            "/subjects/<subject:path>/versions/<version:path>",
            callback=self.subject_version_delete,
            method="DELETE",
        )
        self.route("/subjects/<subject:path>", callback=self.subject_delete, method="DELETE")
        # Pre-set everything ``close()`` inspects so a failure during start-up
        # below does not become an AttributeError on shutdown.
        self.ksr = None
        self.mc = None
        self.producer = None
        self.read_config()
        self._create_producer()
        self._create_schema_reader()
        self._create_master_coordinator()
        self.app.on_startup.append(self.create_http_client)
        self.master_lock = asyncio.Lock()
        self.log.info("Karapace initialized")

    def close(self):
        """Close the coordinator, schema reader and producer if they were created."""
        self.log.info("Shutting down all auxiliary threads")
        if self.mc:
            self.mc.close()
        if self.ksr:
            self.ksr.close()
        if self.producer:
            self.producer.close()

    def read_config(self):
        """Load the JSON config file into ``self.config`` and apply its log level.

        Raises:
            InvalidConfiguration: if the file does not exist or cannot be loaded.
                An unusable ``log_level`` value is logged, not raised.
        """
        if not os.path.exists(self.config_path):
            raise InvalidConfiguration()
        try:
            with open(self.config_path, "r") as config_file:
                config = json.load(config_file)
            self.config = set_config_defaults(config)
            try:
                logging.getLogger().setLevel(config["log_level"])
            except ValueError:
                self.log.exception("Problem with log_level: %r", config["log_level"])
        except Exception as ex:
            raise InvalidConfiguration(ex) from ex

    def _create_schema_reader(self):
        self.ksr = KafkaSchemaReader(config=self.config)
        self.ksr.start()

    def _create_master_coordinator(self):
        self.mc = MasterCoordinator(config=self.config)
        self.mc.start()

    def _create_producer(self):
        self.producer = KafkaProducer(
            bootstrap_servers=self.config["bootstrap_uri"],
            security_protocol=self.config["security_protocol"],
            ssl_cafile=self.config["ssl_cafile"],
            ssl_certfile=self.config["ssl_certfile"],
            ssl_keyfile=self.config["ssl_keyfile"],
            api_version=(1, 0, 0),
        )

    @staticmethod
    def r(body, status=200):
        """Answer the current request by raising an ``HTTPResponse`` with the registry content type."""
        raise HTTPResponse(
            body=body,
            status=status,
            content_type="application/vnd.schemaregistry.v1+json",
            headers={},
        )

    def get_offset_from_queue(self, sent_offset):
        """Block until ``sent_offset`` is read from the schema reader's queue.

        Any other offset taken from the queue is put back.
        """
        start_time = time.monotonic()
        while True:
            self.log.info("Starting to wait for offset: %r from ksr queue", sent_offset)
            offset = self.ksr.queue.get()
            if offset == sent_offset:
                self.log.info(
                    "We've consumed back produced offset: %r message back, everything is in sync, took: %.4f",
                    offset,
                    time.monotonic() - start_time,
                )
                break
            self.log.error("Put the offset: %r back to queue, someone else is waiting for this?", offset)
            self.ksr.queue.put(offset)

    def send_kafka_message(self, key, value):
        """Produce ``key``/``value`` to the configured topic and wait until the reader has consumed it.

        ``str`` arguments are UTF-8 encoded first. Returns the producer future.
        """
        if isinstance(key, str):
            key = key.encode("utf8")
        if isinstance(value, str):
            value = value.encode("utf8")

        future = self.producer.send(self.config["topic_name"], key=key, value=value)
        self.producer.flush(timeout=self.kafka_timeout)
        msg = future.get(self.kafka_timeout)
        self.log.warning("Sent kafka msg key: %r, value: %r, offset: %r", key, value, msg.offset)
        self.get_offset_from_queue(msg.offset)
        return future

    def send_schema_message(self, subject, parsed_schema_json, schema_id, version, deleted):
        """Send a SCHEMA record for ``subject``/``version`` with the JSON-encoded schema."""
        key = '{{"subject":"{}","version":{},"magic":1,"keytype":"SCHEMA"}}'.format(subject, version)
        value = {
            "subject": subject,
            "version": version,
            "id": schema_id,
            "schema": json_encode(parsed_schema_json, compact=True),
            "deleted": deleted,
        }
        return self.send_kafka_message(key, json_encode(value, compact=True))

    def send_config_message(self, compatibility_level, subject=None):
        """Send a CONFIG record; ``subject=None`` produces the key with a null subject."""
        if subject is not None:
            key = '{{"subject":"{}","magic":0,"keytype":"CONFIG"}}'.format(subject)
        else:
            key = '{"subject":null,"magic":0,"keytype":"CONFIG"}'
        value = '{{"compatibilityLevel":"{}"}}'.format(compatibility_level)
        return self.send_kafka_message(key, value)

    def send_delete_subject_message(self, subject):
        """Send a DELETE_SUBJECT record for ``subject``."""
        key = '{{"subject":"{}","magic":0,"keytype":"DELETE_SUBJECT"}}'.format(subject)
        # NOTE(review): the version in the payload is hard-coded to 2 regardless of
        # the subject's actual versions - confirm whether consumers rely on it.
        value = '{{"subject":"{}","version":2}}'.format(subject)
        return self.send_kafka_message(key, value)

    async def compatibility_check(self, *, subject, version, request):
        """Check for schema compatibility"""
        body = request.json
        self.log.info("Got request to check subject: %r, version_id: %r compatibility", subject, version)
        old = await self.subject_version_get(subject=subject, version=version, return_dict=True)
        self.log.info("Existing schema: %r, new_schema: %r", old["schema"], body["schema"])
        try:
            new = avro.schema.Parse(body["schema"])
        except avro.schema.SchemaParseException:
            self.log.warning("Invalid schema: %r", body["schema"])
            self.r(body={"error_code": 44201, "message": "Invalid Avro schema"}, status=422)
        try:
            old_schema = avro.schema.Parse(json.loads(old["schema"]))
        except avro.schema.SchemaParseException:
            self.log.warning("Invalid existing schema: %r", old["schema"])
            self.r(body={"error_code": 44201, "message": "Invalid Avro schema"}, status=422)

        compat = Compatibility(
            source=old_schema,
            target=new,
            compatibility=old.get("compatibility", self.ksr.config["compatibility"]),
        )
        try:
            compat.check()
        except IncompatibleSchema as ex:
            self.log.warning("Invalid schema %s found by compatibility check: old: %s new: %s", ex, old_schema, new)
            self.r({"is_compatible": False})
        self.r({"is_compatible": True})

    async def schemas_get(self, *, schema_id):
        """Respond with the schema stored under ``schema_id``; 404 if the id is not an integer or unknown."""
        try:
            schema_id_int = int(schema_id)
        except ValueError:
            # The id comes straight from the URL path; a non-numeric value cannot
            # match any schema, so answer 404 instead of failing the request.
            self.r({"error_code": 404, "message": "HTTP 404 Not Found"}, status=404)
        schema = self.ksr.schemas.get(schema_id_int)
        if not schema:
            self.log.warning("Schema: %r that was requested, not found", schema_id_int)
            self.r(body={"error_code": 40403, "message": "Schema not found"}, status=404)
        self.r({"schema": schema})

    async def config_get(self):
        self.r({"compatibilityLevel": self.ksr.config["compatibility"]})

    async def config_set(self, *, request):
        """Set the global compatibility level; 422 if the requested level is not a known mode."""
        if "compatibility" in request.json and request.json["compatibility"] in COMPATIBILITY_MODES:
            compatibility_level = request.json["compatibility"]
            self.send_config_message(compatibility_level=compatibility_level, subject=None)
        else:
            self.r(
                body={
                    "error_code": 42203,
                    "message": "Invalid compatibility level. Valid values are none, backward, forward and full",
                },
                status=422,
            )
        self.r({"compatibility": self.ksr.config["compatibility"]})

    async def config_subject_get(self, *, subject):
        """Respond with the subject's own compatibility level, or 404 if the subject or setting is absent."""
        subject_data = self.ksr.subjects.get(subject)
        if not subject_data:
            self.r({"error_code": 40401, "message": "no subject"}, status=404)

        if "compatibility" in subject_data:
            self.r({"compatibilityLevel": subject_data["compatibility"]})

        self.r({"error_code": 40401, "message": "Subject not found."}, status=404)

    async def config_subject_set(self, *, request, subject):
        """Set the compatibility level of an existing subject."""
        subject_data = self.ksr.subjects.get(subject)
        if not subject_data:
            self.r({"error_code": 40401, "message": "no subject"}, status=404)

        if "compatibility" in request.json and request.json["compatibility"] in COMPATIBILITY_MODES:
            self.send_config_message(compatibility_level=request.json["compatibility"], subject=subject)
        else:
            self.r(body={"error_code": 42203, "message": "Invalid compatibility level"}, status=422)

        self.r({"compatibility": request.json["compatibility"]})

    async def subjects_list(self):
        self.r(list(self.ksr.subjects.keys()))

    async def subject_delete(self, *, subject):
        """Delete a subject and respond with the versions it had."""
        subject_data = self.ksr.subjects.get(subject, {})
        if not subject_data:
            self.r({"error_code": 40401, "message": "subject does not exist"}, status=404)

        self.send_delete_subject_message(subject)
        self.r(list(subject_data["schemas"]), status=200)

    async def subject_version_get(self, *, subject, version, return_dict=False):
        """Respond with (or, if ``return_dict``, return) one version of a subject.

        ``version`` is a positive integer (as text) or ``"latest"``. With
        ``return_dict=True`` the subject's compatibility setting, when present,
        is added to the returned dict.
        """
        if version != "latest":
            try:
                version_number = int(version)
            except ValueError:
                # Non-numeric text is just another invalid version id.
                version_number = 0
            if version_number < 1:
                self.r(
                    {
                        "error_code": 42202,
                        "message": 'The specified version is not a valid version id. '
                        'Allowed values are between [1, 2^31-1] and the string "latest"',
                    },
                    status=422,
                )

        subject_data = self.ksr.subjects.get(subject)
        if not subject_data:
            self.r({"error_code": 40401, "message": "no subject"}, status=404)

        schemas = subject_data["schemas"]
        if not schemas:
            # Nothing to pick a version from; max() below would fail on an empty map.
            self.r({"error_code": 40401, "message": "no subject"}, status=404)

        max_version = max(schemas)
        schema = None
        if version == "latest":
            version = max_version
            schema = schemas[max_version]
        elif int(version) <= max_version:
            # May be None when the number is in range but that version is absent.
            schema = schemas.get(int(version))
        if not schema:
            self.r({"error_code": 40402, "message": "Version not found."}, status=404)

        schema_string = schema["schema"]
        schema_id = schema["id"]
        ret = {
            "subject": subject,
            "version": int(version),
            "id": schema_id,
            "schema": json_encode(schema_string, compact=True),
        }
        if return_dict:
            # Return also compatibility information to compatibility check
            if subject_data.get("compatibility"):
                ret["compatibility"] = subject_data.get("compatibility")
            return ret
        self.r(ret)

    async def subject_version_delete(self, *, subject, version):
        """Mark one version of a subject deleted and respond with the version number."""
        version = int(version)
        subject_data = self.ksr.subjects.get(subject)
        if not subject_data:
            self.r({"error_code": 40401, "message": "subject not found"}, status=404)

        schema = subject_data["schemas"].get(version, None)
        if not schema:
            self.r({"error_code": 40402, "message": "Version not found."}, status=404)
        schema_id = schema["id"]
        # NOTE(review): the whole stored entry is passed as the schema payload here,
        # whereas registration passes only the schema JSON - confirm this is intended.
        self.send_schema_message(subject, schema, schema_id, version, deleted=True)
        self.r(str(version), status=200)

    async def subject_versions_list(self, *, subject):
        subject_data = self.ksr.subjects.get(subject)
        if not subject_data:
            self.r({"error_code": 40401, "message": "subject not found"}, status=404)

        self.r(list(subject_data["schemas"]), status=200)

    async def get_master(self):
        """Poll once a second until a master is known and the schema reader is ready.

        Returns the ``(master, master_url)`` pair from the coordinator.
        """
        async with self.master_lock:
            while True:
                master, master_url = self.mc.get_master_info()
                if master is None:
                    self.log.info("No master set: %r, url: %r", master, master_url)
                elif self.ksr.ready is False:
                    self.log.info("Schema reader isn't ready yet: %r", self.ksr.ready)
                else:
                    return master, master_url
                await asyncio.sleep(1.0)

    async def subject_post(self, *, subject, request):
        """Register a schema under ``subject``, locally when master or by forwarding to the master."""
        body = request.json
        self.log.debug("POST with subject: %r, request: %r", subject, body)
        are_we_master, master_url = await self.get_master()
        if are_we_master:
            self.write_new_schema_local(subject, body)
        elif are_we_master is None:
            self.r(
                {"error_code": 50003, "message": "Error while forwarding the request to the master."},
                status=500,
            )
        else:
            await self.write_new_schema_remote(subject, body, master_url)

    def write_new_schema_local(self, subject, body):
        """Since we're the master we get to write the new schema"""
        self.log.info("Writing new schema locally since we're the master")
        try:
            new_schema = avro.schema.Parse(body["schema"])
        except avro.schema.SchemaParseException:
            self.log.warning("Invalid schema: %r", body["schema"])
            self.r(body={"error_code": 44201, "message": "Invalid Avro schema"}, status=422)

        if subject not in self.ksr.subjects:
            schema_id = self.ksr.get_new_schema_id()
            version = 1
            self.log.info(
                "Registering new subject: %r with version: %r to schema %r, schema_id: %r",
                subject,
                version,
                new_schema.to_json(),
                schema_id,
            )
        else:
            # First check if any of the existing schemas for the subject match
            subject_data = self.ksr.subjects[subject]
            schema_versions = sorted(subject_data["schemas"])
            # Go through these in version order
            for version in schema_versions:
                schema = subject_data["schemas"][version]
                parsed_version_schema = avro.schema.Parse(schema["schema"])
                if parsed_version_schema == new_schema:
                    self.r({"id": schema["id"]})
                else:
                    self.log.debug(
                        "schema: %s did not match with: %s", schema["schema"], new_schema.to_json()
                    )

            # Run a compatibility check between the newest on file schema and the one being submitted now
            latest_schema = subject_data["schemas"][schema_versions[-1]]
            old_schema = avro.schema.Parse(latest_schema["schema"])
            compat = Compatibility(
                old_schema,
                new_schema,
                compatibility=subject_data.get("compatibility", self.ksr.config["compatibility"]),
            )
            try:
                compat.check()
            except IncompatibleSchema as ex:
                self.log.warning("Incompatible schema: %s", ex)
                self.r(
                    body={
                        "error_code": 409,
                        "message": "Schema being registered is incompatible with an earlier schema",
                    },
                    status=409,
                )

            # We didn't find an existing schema, so go and create one
            schema_id = self.ksr.get_new_schema_id()
            version = max(self.ksr.subjects[subject]["schemas"]) + 1
            self.log.info(
                "Registering subject: %r, id: %r new version: %r with schema %r, schema_id: %r",
                subject,
                schema_id,
                version,
                new_schema.to_json(),
                schema_id,
            )
        self.send_schema_message(subject, new_schema.to_json(), schema_id, version, deleted=False)
        self.r({"id": schema_id})

    async def write_new_schema_remote(self, subject, body, master_url):
        """POST the registration to the master and relay its response body and status."""
        self.log.info("Writing new schema to remote url: %r since we're not the master", master_url)
        response = await self.http_request(
            url="{}/subjects/{}/versions".format(master_url, subject),
            method="POST",
            json=body,
            timeout=60.0,
        )
        self.r(body=response.body, status=response.status)
class Karapace(RestApp):
    """REST application exposing the schema registry endpoints.

    Registry state is read through a ``KafkaSchemaReader`` (``self.ksr``) and
    changed by producing messages to the Kafka topic named by
    ``config["topic_name"]``. A ``MasterCoordinator`` (``self.mc``) tells the
    node whether it may write schemas itself or has to forward the request.

    Every handler finishes by calling :meth:`r`, which raises ``HTTPResponse``;
    code that follows a ``self.r(...)`` call is therefore never reached.
    """

    def __init__(self, config_path):
        """Load the configuration, register all routes and start the helpers.

        Args:
            config_path: Path of the JSON configuration file.

        Raises:
            InvalidConfiguration: If the configuration cannot be read.
        """
        self.config = {}
        self.config_path = config_path
        self.config = self.read_config(self.config_path)
        self._sentry_config = self.config.get("sentry", {"dsn": None}).copy()
        # The environment variable takes precedence over the config file.
        if os.environ.get("SENTRY_DSN"):
            self._sentry_config["dsn"] = os.environ["SENTRY_DSN"]
        if "tags" not in self._sentry_config:
            self._sentry_config["tags"] = {}
        self._sentry_config["tags"]["app"] = "Karapace"

        super().__init__(app_name="Karapace", sentry_config=self._sentry_config)

        self.log = logging.getLogger("Karapace")
        self.kafka_timeout = 10

        self.route(
            "/compatibility/subjects/<subject:path>/versions/<version:path>",
            callback=self.compatibility_check,
            method="POST",
            schema_request=True)
        self.route("/", callback=self.root_get, method="GET")
        self.route("/config/<subject:path>",
                   callback=self.config_subject_get,
                   method="GET",
                   schema_request=True)
        self.route("/config/<subject:path>",
                   callback=self.config_subject_set,
                   method="PUT",
                   schema_request=True)
        self.route("/config",
                   callback=self.config_get,
                   method="GET",
                   schema_request=True)
        self.route("/config",
                   callback=self.config_set,
                   method="PUT",
                   schema_request=True)
        self.route("/schemas/ids/<schema_id:path>",
                   callback=self.schemas_get,
                   method="GET",
                   schema_request=True)
        self.route("/subjects",
                   callback=self.subjects_list,
                   method="GET",
                   schema_request=True)
        self.route("/subjects/<subject:path>/versions",
                   callback=self.subject_post,
                   method="POST",
                   schema_request=True)
        self.route("/subjects/<subject:path>",
                   callback=self.subjects_schema_post,
                   method="POST",
                   schema_request=True)
        self.route("/subjects/<subject:path>/versions",
                   callback=self.subject_versions_list,
                   method="GET",
                   schema_request=True)
        self.route("/subjects/<subject:path>/versions/<version>",
                   callback=self.subject_version_get,
                   method="GET",
                   schema_request=True)
        self.route("/subjects/<subject:path>/versions/<version:path>",
                   callback=self.subject_version_delete,
                   method="DELETE",
                   schema_request=True)
        self.route("/subjects/<subject:path>/versions/<version>/schema",
                   callback=self.subject_version_schema_get,
                   method="GET",
                   schema_request=True)
        self.route("/subjects/<subject:path>",
                   callback=self.subject_delete,
                   method="DELETE",
                   schema_request=True)

        # Pre-set every helper handle so close() is safe even when one of the
        # _create_* calls below fails part-way through construction.
        self.ksr = None
        self.mc = None
        self.producer = None
        self._set_log_level()
        self._create_producer()
        self._create_schema_reader()
        self._create_master_coordinator()
        self.app.on_startup.append(self.create_http_client)

        self.master_lock = asyncio.Lock()
        self.log.info("Karapace initialized")

    def close(self):
        """Close the master coordinator, schema reader and producer, if set."""
        self.log.info("Shutting down all auxiliary threads")
        if self.mc:
            self.mc.close()
        if self.ksr:
            self.ksr.close()
        if self.producer:
            self.producer.close()

    @staticmethod
    def read_config(config_path):
        """Read the JSON configuration file and apply the defaults.

        Args:
            config_path: Path of the JSON configuration file.

        Returns:
            The parsed configuration after ``set_config_defaults``.

        Raises:
            InvalidConfiguration: If the file does not exist, or if reading,
                parsing or applying the defaults fails (the original error is
                chained as the cause).
        """
        if not os.path.exists(config_path):
            raise InvalidConfiguration()
        try:
            # The context manager closes the file even when parsing fails.
            with open(config_path, "r") as config_file:
                config = json.loads(config_file.read())
            return set_config_defaults(config)
        except Exception as ex:
            raise InvalidConfiguration(ex) from ex

    def _set_log_level(self):
        """Apply ``config["log_level"]`` to the root logger.

        A level that ``setLevel`` rejects with ``ValueError`` is logged and
        otherwise ignored.
        """
        try:
            logging.getLogger().setLevel(self.config["log_level"])
        except ValueError:
            self.log.exception("Problem with log_level: %r",
                               self.config["log_level"])

    def _create_schema_reader(self):
        """Create and start the schema reader (``self.ksr``)."""
        self.ksr = KafkaSchemaReader(config=self.config, )
        self.ksr.start()

    def _create_master_coordinator(self):
        """Create and start the master coordinator (``self.mc``)."""
        self.mc = MasterCoordinator(config=self.config)
        self.mc.start()

    def _create_producer(self):
        """Create the Kafka producer (``self.producer``) from the configuration."""
        self.producer = KafkaProducer(
            bootstrap_servers=self.config["bootstrap_uri"],
            security_protocol=self.config["security_protocol"],
            ssl_cafile=self.config["ssl_cafile"],
            ssl_certfile=self.config["ssl_certfile"],
            ssl_keyfile=self.config["ssl_keyfile"],
            api_version=(1, 0, 0),
            metadata_max_age_ms=self.config["metadata_max_age_ms"],
        )

    def _subject_get(self, subject, content_type):
        """Return the data of ``subject`` with its schemas from ``get_schemas``.

        Responds with 40401 / HTTP 404 when the subject is unknown or when
        ``self.ksr.get_schemas`` returns nothing for it.

        Returns:
            A shallow copy of the subject's entry whose ``"schemas"`` key holds
            the result of ``self.ksr.get_schemas(subject)``.
        """
        subject_data = self.ksr.subjects.get(subject)
        if not subject_data:
            self.r({
                "error_code": 40401,
                "message": "Subject not found."
            }, content_type, status=404)

        schemas = self.ksr.get_schemas(subject)
        if not schemas:
            self.r({
                "error_code": 40401,
                "message": "Subject not found."
            }, content_type, status=404)

        # Copy first: assigning into the dict held by the schema reader would
        # overwrite the reader's own "schemas" entry for this subject.
        subject_data = subject_data.copy()
        subject_data["schemas"] = schemas
        return subject_data

    def _validate_version(self, content_type, version):  # pylint: disable=inconsistent-return-statements
        """Return ``version`` if it is a positive integer or ``"latest"``.

        Anything else is answered with 42202 / HTTP 422.
        """
        try:
            version_number = int(version)
            if version_number > 0:
                return version
        except ValueError:
            if version == "latest":
                return version
        self.r(body={
            "error_code": 42202,
            "message": "The specified version is not a valid version id. "
                       "Allowed values are between [1, 2^31-1] and the string \"latest\""
        }, content_type=content_type, status=422)

    @staticmethod
    def r(body, content_type, status=200):
        """Finish the request by raising ``HTTPResponse``; never returns."""
        raise HTTPResponse(
            body=body,
            status=status,
            content_type=content_type,
            headers={},
        )

    def get_offset_from_queue(self, sent_offset):
        """Block until ``sent_offset`` is taken from the schema reader's queue.

        Offsets that do not match are put back on the queue and the wait
        continues.
        """
        start_time = time.monotonic()
        while True:
            self.log.info("Starting to wait for offset: %r from ksr queue",
                          sent_offset)
            offset = self.ksr.queue.get()
            if offset == sent_offset:
                self.log.info(
                    "We've consumed back produced offset: %r message back, everything is in sync, took: %.4f",
                    offset, time.monotonic() - start_time)
                break
            self.log.error(
                "Put the offset: %r back to queue, someone else is waiting for this?",
                offset)
            self.ksr.queue.put(offset)

    def send_kafka_message(self, key, value):
        """Produce one message and wait until its offset shows up in the queue.

        ``str`` keys and values are encoded as UTF-8 before sending.

        Returns:
            The future returned by ``self.producer.send``.
        """
        if isinstance(key, str):
            key = key.encode("utf8")
        if isinstance(value, str):
            value = value.encode("utf8")

        future = self.producer.send(self.config["topic_name"],
                                    key=key,
                                    value=value)
        self.producer.flush(timeout=self.kafka_timeout)
        msg = future.get(self.kafka_timeout)
        self.log.debug("Sent kafka msg key: %r, value: %r, offset: %r", key,
                       value, msg.offset)
        # Wait for the schema reader to report this offset via its queue.
        self.get_offset_from_queue(msg.offset)
        return future

    def send_schema_message(self, subject, parsed_schema_json, schema_id,
                            version, deleted):
        """Produce a ``SCHEMA`` message for one subject version."""
        key = '{{"subject":"{}","version":{},"magic":1,"keytype":"SCHEMA"}}'.format(
            subject, version)
        value = {
            "subject": subject,
            "version": version,
            "id": schema_id,
            "schema": json_encode(parsed_schema_json, compact=True),
            "deleted": deleted
        }
        return self.send_kafka_message(key, json_encode(value, compact=True))

    def send_config_message(self, compatibility_level, subject=None):
        """Produce a ``CONFIG`` message; ``subject=None`` uses the null-subject key."""
        if subject is not None:
            key = '{{"subject":"{}","magic":0,"keytype":"CONFIG"}}'.format(
                subject)
        else:
            key = '{"subject":null,"magic":0,"keytype":"CONFIG"}'
        value = '{{"compatibilityLevel":"{}"}}'.format(compatibility_level)
        return self.send_kafka_message(key, value)

    def send_delete_subject_message(self, subject, version):
        """Produce a ``DELETE_SUBJECT`` message for ``subject``."""
        key = '{{"subject":"{}","magic":0,"keytype":"DELETE_SUBJECT"}}'.format(
            subject)
        value = '{{"subject":"{}","version":{}}}'.format(subject, version)
        return self.send_kafka_message(key, value)

    async def root_get(self):
        """Answer ``GET /`` with an empty JSON object."""
        self.r({}, "application/json")

    async def compatibility_check(self, content_type, *, subject, version,
                                  request):
        """Check for schema compatibility

        Compares ``request.json["schema"]`` with the stored schema of
        ``subject``/``version`` and answers ``{"is_compatible": bool}``. A
        schema that cannot be parsed is answered with 44201 / HTTP 422.
        """
        body = request.json
        self.log.info(
            "Got request to check subject: %r, version_id: %r compatibility",
            subject, version)
        old = await self.subject_version_get(content_type=content_type,
                                             subject=subject,
                                             version=version,
                                             return_dict=True)
        self.log.info("Existing schema: %r, new_schema: %r", old["schema"],
                      body["schema"])
        try:
            new = avro.schema.Parse(body["schema"])
        except avro.schema.SchemaParseException:
            self.log.warning("Invalid schema: %r", body["schema"])
            self.r(body={
                "error_code": 44201,
                "message": "Invalid Avro schema"
            }, content_type=content_type, status=422)
        try:
            old_schema = avro.schema.Parse(old["schema"])
        except avro.schema.SchemaParseException:
            self.log.warning("Invalid existing schema: %r", old["schema"])
            self.r(body={
                "error_code": 44201,
                "message": "Invalid Avro schema"
            }, content_type=content_type, status=422)

        # The subject-level compatibility setting wins over the global one.
        compat = Compatibility(source=old_schema,
                               target=new,
                               compatibility=old.get(
                                   "compatibility",
                                   self.ksr.config["compatibility"]))
        try:
            compat.check()
        except IncompatibleSchema as ex:
            self.log.warning(
                "Invalid schema %s found by compatibility check: old: %s new: %s",
                ex, old_schema, new)
            self.r({"is_compatible": False}, content_type)
        self.r({"is_compatible": True}, content_type)

    async def schemas_get(self, content_type, *, schema_id):
        """Answer with the schema stored under the numeric ``schema_id``.

        A non-integer id is answered with HTTP 404, an unknown id with
        40403 / HTTP 404.
        """
        try:
            schema_id_int = int(schema_id)
        except ValueError:
            self.r({
                "error_code": 404,
                "message": "HTTP 404 Not Found"
            }, content_type, status=404)
        schema = self.ksr.schemas.get(schema_id_int)
        if not schema:
            self.log.warning("Schema: %r that was requested, not found",
                             schema_id_int)
            self.r(body={
                "error_code": 40403,
                "message": "Schema not found"
            }, content_type=content_type, status=404)
        self.r({"schema": schema}, content_type)

    async def config_get(self, content_type):
        """Answer with the global compatibility level."""
        self.r({"compatibilityLevel": self.ksr.config["compatibility"]},
               content_type)

    async def config_set(self, content_type, *, request):
        """Set the global compatibility level from ``request.json``.

        A missing or unknown level is answered with 42203 / HTTP 422.
        """
        if "compatibility" in request.json and request.json[
                "compatibility"] in COMPATIBILITY_MODES:
            compatibility_level = request.json["compatibility"]
            self.send_config_message(compatibility_level=compatibility_level,
                                     subject=None)
        else:
            self.r(body={
                "error_code": 42203,
                "message": "Invalid compatibility level. Valid values are none, backward, forward and full"
            }, content_type=content_type, status=422)
        self.r({"compatibility": self.ksr.config["compatibility"]},
               content_type)

    async def config_subject_get(self, content_type, *, subject):
        """Answer with the compatibility level configured for ``subject``.

        An unknown subject, or one without its own level, is answered with
        40401 / HTTP 404.
        """
        # Config for a subject can exist without schemas so no need to check for their existence
        subject_data = self.ksr.subjects.get(subject)
        # .get() yields None for an unknown subject; guard before the lookup.
        if subject_data and "compatibility" in subject_data:
            self.r({"compatibilityLevel": subject_data["compatibility"]},
                   content_type)
        self.r({
            "error_code": 40401,
            "message": "Subject not found."
        }, content_type, status=404)

    async def config_subject_set(self, content_type, *, request, subject):
        """Set the compatibility level of ``subject`` from ``request.json``.

        A missing or unknown level is answered with 42203 / HTTP 422.
        """
        if "compatibility" in request.json and request.json[
                "compatibility"] in COMPATIBILITY_MODES:
            self.send_config_message(
                compatibility_level=request.json["compatibility"],
                subject=subject)
        else:
            self.r(body={
                "error_code": 42203,
                "message": "Invalid compatibility level"
            }, content_type=content_type, status=422)
        self.r({"compatibility": request.json["compatibility"]}, content_type)

    async def subjects_list(self, content_type):
        """Answer with the subjects for which ``get_schemas`` returns anything."""
        subjects_list = [
            key for key in self.ksr.subjects if self.ksr.get_schemas(key)
        ]
        self.r(subjects_list, content_type, status=200)

    async def subject_delete(self, content_type, *, subject):
        """Delete ``subject`` and answer with the list of its versions."""
        self._subject_get(subject, content_type)
        version_list = list(self.ksr.get_schemas(subject))
        if version_list:
            latest_schema_id = version_list[-1]
        else:
            latest_schema_id = 0
        self.send_delete_subject_message(subject, latest_schema_id)
        self.r(version_list, content_type, status=200)

    async def subject_version_get(self, content_type, *, subject, version,
                                  return_dict=False):
        """Look up one version of ``subject``.

        Args:
            version: A positive integer (as text) or ``"latest"``.
            return_dict: When true, return the result instead of answering the
                request; the subject's compatibility level is included if set.

        A version that is not stored is answered with 40402 / HTTP 404.
        """
        self._validate_version(content_type, version)
        subject_data = self._subject_get(subject, content_type)
        schemas = subject_data["schemas"]

        if version == "latest":
            version = max(schemas)
            schema = schemas[version]
        else:
            # A version can be absent even when it is below the highest one,
            # so test the lookup result rather than comparing with max().
            schema = schemas.get(int(version))
            if schema is None:
                self.r({
                    "error_code": 40402,
                    "message": "Version not found."
                }, content_type, status=404)

        schema_string = schema["schema"]
        schema_id = schema["id"]
        ret = {
            "subject": subject,
            "version": int(version),
            "id": schema_id,
            "schema": schema_string,
        }
        if return_dict:
            # Return also compatibility information to compatibility check
            if subject_data.get("compatibility"):
                ret["compatibility"] = subject_data.get("compatibility")
            return ret
        self.r(ret, content_type)

    async def subject_version_delete(self, content_type, *, subject, version):
        """Mark one version of ``subject`` as deleted.

        ``version`` is validated like in the GET handlers: anything but a
        positive integer or ``"latest"`` is answered with 42202 / HTTP 422
        instead of failing on ``int()``. ``"latest"`` resolves to the highest
        stored version. A version that is not stored is answered with
        40402 / HTTP 404.
        """
        self._validate_version(content_type, version)
        subject_data = self._subject_get(subject, content_type)
        schemas = subject_data["schemas"]
        version = max(schemas) if version == "latest" else int(version)

        schema = schemas.get(version, None)
        if not schema:
            self.r({
                "error_code": 40402,
                "message": "Version not found."
            }, content_type, status=404)
        schema_id = schema["id"]
        # NOTE(review): the whole stored entry is passed here, while the other
        # caller passes new_schema.to_json() - confirm this is intended.
        self.send_schema_message(subject,
                                 schema,
                                 schema_id,
                                 version,
                                 deleted=True)
        self.r(str(version), content_type, status=200)

    async def subject_version_schema_get(self, content_type, *, subject,
                                         version):
        """Answer with only the schema text of one version of ``subject``.

        A version that is not stored is answered with 40402 / HTTP 404.
        """
        self._validate_version(content_type, version)
        subject_data = self._subject_get(subject, content_type)
        schemas = subject_data["schemas"]

        if version == "latest":
            schema_data = schemas[max(schemas)]
        else:
            # Covers both "above the highest version" and "missing in between".
            schema_data = schemas.get(int(version))
            if schema_data is None:
                self.r({
                    "error_code": 40402,
                    "message": "Version not found."
                }, content_type, status=404)
        self.r(schema_data["schema"], content_type)

    async def subject_versions_list(self, content_type, *, subject):
        """Answer with the version numbers stored for ``subject``."""
        subject_data = self._subject_get(subject, content_type)
        self.r(list(subject_data["schemas"]), content_type, status=200)

    async def get_master(self):
        """Wait until a master is known and the schema reader is ready.

        Polls once per second while holding ``self.master_lock``.

        Returns:
            The ``(master, master_url)`` pair from ``self.mc.get_master_info()``.
        """
        async with self.master_lock:
            while True:
                master, master_url = self.mc.get_master_info()
                if master is None:
                    self.log.info("No master set: %r, url: %r", master,
                                  master_url)
                elif self.ksr.ready is False:
                    self.log.info("Schema reader isn't ready yet: %r",
                                  self.ksr.ready)
                else:
                    return master, master_url
                await asyncio.sleep(1.0)

    def _validate_schema_request_body(self, content_type, body):
        """Reject bodies that are not a dict or carry fields besides ``schema``.

        A non-dict body is answered with HTTP 500, an unknown field with
        HTTP 422.
        """
        if not isinstance(body, dict):
            self.r({
                "error_code": 500,
                "message": "Internal Server Error"
            }, content_type, status=500)
        for field in body:
            if field != "schema":
                self.r(body={
                    "error_code": 422,
                    "message": f"Unrecognized field: {field}",
                }, content_type=content_type, status=422)

    async def subjects_schema_post(self, content_type, *, subject, request):
        """Look up the submitted schema among the versions of ``subject``.

        Answers with subject, version, id and schema of the matching version,
        or with 40403 / HTTP 404 when none matches.
        """
        body = request.json
        self._validate_schema_request_body(content_type, body)
        subject_data = self._subject_get(subject, content_type)
        if "schema" not in body:
            self.r({
                "error_code": 500,
                "message": "Internal Server Error"
            }, content_type, status=500)
        try:
            new_schema = avro.schema.Parse(body["schema"])
        except avro.schema.SchemaParseException:
            self.r(body={
                "error_code": 500,
                "message": f"Error while looking up schema under subject {subject}"
            }, content_type=content_type, status=500)

        # Compare on the compact JSON encoding of the parsed schema.
        new_schema_encoded = json_encode(new_schema.to_json(), compact=True)
        for schema in subject_data["schemas"].values():
            if schema["schema"] == new_schema_encoded:
                ret = {
                    "subject": subject,
                    "version": schema["version"],
                    "id": schema["id"],
                    "schema": schema["schema"],
                }
                self.r(ret, content_type)
        self.r({
            "error_code": 40403,
            "message": "Schema not found"
        }, content_type, status=404)

    async def subject_post(self, content_type, *, subject, request):
        """Register a schema under ``subject``, locally or via the master."""
        body = request.json
        self.log.debug("POST with subject: %r, request: %r", subject, body)
        self._validate_schema_request_body(content_type, body)
        if "schema" not in body:
            self.r({
                "error_code": 500,
                "message": "Internal Server Error"
            }, content_type, status=500)
        are_we_master, master_url = await self.get_master()
        if are_we_master:
            self.write_new_schema_local(subject, body, content_type)
        elif are_we_master is None:
            self.r(
                {
                    "error_code": 50003,
                    "message": "Error while forwarding the request to the master."
                },
                content_type,
                status=500)
        else:
            await self.write_new_schema_remote(subject, body, master_url,
                                               content_type)

    def write_new_schema_local(self, subject, body, content_type):
        """Since we're the master we get to write the new schema

        Parses ``body["schema"]`` (44201 / HTTP 422 on failure), answers with
        the id of an identical stored schema if there is one, otherwise checks
        compatibility (HTTP 409 on failure), publishes the schema and answers
        with ``{"id": schema_id}``.
        """
        self.log.info("Writing new schema locally since we're the master")
        try:
            new_schema = avro.schema.Parse(body["schema"])
        except avro.schema.SchemaParseException:
            self.log.warning("Invalid schema: %r", body["schema"])
            self.r(body={
                "error_code": 44201,
                "message": "Invalid Avro schema"
            }, content_type=content_type, status=422)

        if subject not in self.ksr.subjects or not self.ksr.subjects.get(
                subject)["schemas"]:
            schema_id = self.ksr.get_schema_id(new_schema)
            version = 1
            self.log.info(
                "Registering new subject: %r with version: %r to schema %r, schema_id: %r",
                subject, version, new_schema.to_json(), schema_id)
        else:
            # First check if any of the existing schemas for the subject match
            subject_data = self.ksr.subjects[subject]
            schemas = self.ksr.get_schemas(subject)
            if not schemas:
                # Previous ones have been deleted by the user.
                version = max(self.ksr.subjects[subject]["schemas"]) + 1
                schema_id = self.ksr.get_schema_id(new_schema)
                self.log.info(
                    "Registering subject: %r, id: %r new version: %r with schema %r, schema_id: %r",
                    subject, schema_id, version, new_schema.to_json(),
                    schema_id)
                self.send_schema_message(subject,
                                         new_schema.to_json(),
                                         schema_id,
                                         version,
                                         deleted=False)
                self.r({"id": schema_id}, content_type)

            schema_versions = sorted(schemas)
            # Go through these in version order
            for version in schema_versions:
                schema = subject_data["schemas"][version]
                parsed_version_schema = avro.schema.Parse(schema["schema"])
                if parsed_version_schema == new_schema:
                    self.r({"id": schema["id"]}, content_type)
                else:
                    self.log.debug("schema: %s did not match with: %s",
                                   schema["schema"], new_schema.to_json())

            compatibility = subject_data.get("compatibility",
                                             self.ksr.config["compatibility"])

            # Run a compatibility check between on file schema(s) and the one being submitted now
            # the check is either towards the latest one or against all previous ones in case of
            # transitive mode
            if compatibility in TRANSITIVE_MODES:
                check_against = schema_versions
            else:
                check_against = [schema_versions[-1]]

            for old_version in check_against:
                old_schema = avro.schema.Parse(
                    subject_data["schemas"][old_version]["schema"])
                compat = Compatibility(old_schema,
                                       new_schema,
                                       compatibility=compatibility)
                try:
                    compat.check()
                except IncompatibleSchema as ex:
                    self.log.warning("Incompatible schema: %s", ex)
                    self.r(body={
                        "error_code": 409,
                        "message": "Schema being registered is incompatible with an earlier schema"
                    }, content_type=content_type, status=409)

            # We didn't find an existing schema and the schema is compatible so go and create one
            schema_id = self.ksr.get_schema_id(new_schema)
            version = max(self.ksr.subjects[subject]["schemas"]) + 1
            self.log.info(
                "Registering subject: %r, id: %r new version: %r with schema %r, schema_id: %r",
                subject, schema_id, version, new_schema.to_json(), schema_id)

        self.send_schema_message(subject,
                                 new_schema.to_json(),
                                 schema_id,
                                 version,
                                 deleted=False)
        self.r({"id": schema_id}, content_type)

    async def write_new_schema_remote(self, subject, body, master_url,
                                      content_type):
        """Forward the registration to the master and relay its answer.

        POSTs ``body`` as JSON to ``{master_url}/subjects/{subject}/versions``
        with a 60 second timeout.
        """
        self.log.info(
            "Writing new schema to remote url: %r since we're not the master",
            master_url)
        response = await self.http_request(
            url="{}/subjects/{}/versions".format(master_url, subject),
            method="POST",
            json=body,
            timeout=60.0)
        self.r(body=response.body,
               content_type=content_type,
               status=response.status)