def __init__(
    self,
    linking_service: LinkingService,
    metric_provider: Type[MetricProvider],
):
    """Set up empty registries and the collaborators used by the explorer.

    Args:
        linking_service: Kept on the instance as ``linking_service``.
        metric_provider: Metric provider class handed to ``DataFlowGraph``.
    """
    self.linking_service = linking_service

    # Registries start out empty.
    self.applications: Dict[str, K8sApp] = {}
    self.kafka_connectors: List[KafkaConnector] = []

    # Collaborators are constructed in the same order as before.
    self.data_flow = DataFlowGraph(metric_provider=metric_provider)
    self.kafka = Kafka()
async def test_get_positioned_json_graph(self, df: DataFlowGraph):
    """After positioning, every graph node carries non-None ``x`` and ``y``."""
    deployment = get_streaming_app_deployment()
    df.add_streaming_app(K8sApp.factory(deployment))

    await df.get_positioned_graph()

    for _, attrs in df.graph.nodes(data=True):
        for axis in ("x", "y"):
            assert attrs.get(axis) is not None
def test_get_positioned_json_graph(self, df: DataFlowGraph):
    """After positioning, every graph node carries non-None ``x`` and ``y``."""
    app = self.get_k8s_app()
    df.add_streaming_app(app)

    df.get_positioned_graph()

    for _, attrs in df.graph.nodes(data=True):
        for axis in ("x", "y"):
            assert attrs.get(axis) is not None
def test_pipeline_cronjob(self, df: DataFlowGraph):
    """A cronjob and a deployment labelled ``pipeline1`` end up in one pipeline."""
    cronjob = get_streaming_app_cronjob(error_topic=None, pipeline="pipeline1")
    df.add_streaming_app(K8sApp.factory(cronjob))

    deployment = get_streaming_app_deployment(
        input_topics="output-topic",
        error_topic=None,
        output_topic="output-topic2",
        pipeline="pipeline1",
    )
    df.add_streaming_app(K8sApp.factory(deployment))

    assert len(df.pipelines) == 1
    assert "pipeline1" in df.pipelines

    expected_nodes = {
        "test-cronjob",
        "output-topic",
        "test-app",
        "output-topic2",
    }
    pipeline1 = df.pipelines["pipeline1"]
    assert set(pipeline1.nodes) == expected_nodes
def test_add_streaming_app(self, df: DataFlowGraph):
    """Adding an app yields the expected node count and topic edges.

    Three configurations are checked in turn: the default app, an app with
    multiple input topics, and an app with extra (multiple) outputs.
    """
    scenarios = [
        # (get_k8s_app kwargs, expected node count, expected edges)
        (
            {},
            4,
            [
                ("input-topic", "test-app"),
                ("test-app", "output-topic"),
                ("test-app", "error-topic"),
            ],
        ),
        # should have multiple input topic
        (
            {"input_topics": "input-topic1,input-topic2"},
            5,
            [
                ("input-topic1", "test-app"),
                ("input-topic2", "test-app"),
                ("test-app", "output-topic"),
                ("test-app", "error-topic"),
            ],
        ),
        (
            {"multiple_outputs": "1=extra-output1,2=extra-output2"},
            6,
            [
                ("input-topic", "test-app"),
                ("test-app", "output-topic"),
                ("test-app", "error-topic"),
                ("test-app", "extra-output1"),
                ("test-app", "extra-output2"),
            ],
        ),
    ]

    for position, (app_kwargs, node_count, edges) in enumerate(scenarios):
        # The graph is only reset between scenarios, not before the first.
        if position:
            df.reset()
        df.add_streaming_app(self.get_k8s_app(**app_kwargs))
        assert len(df.graph.nodes) == node_count
        for origin, target in edges:
            assert df.graph.has_edge(origin, target)
def test_add_source(self, df: DataFlowGraph):
    """A source adds one node and an edge pointing at its target app."""
    source_kwargs = {
        "name": "test-source",
        "node_type": "test-type",
        "target": "test-app",
    }
    source = Source(**source_kwargs)

    app = self.get_k8s_app()
    df.add_streaming_app(app)
    df.add_source(source)

    graph = df.graph
    assert len(graph.nodes) == 5
    assert graph.has_edge("test-source", "test-app")
def test_add_sink(self, df: DataFlowGraph):
    """A sink adds one node and an edge coming from its source app."""
    sink_kwargs = {
        "name": "test-sink",
        "node_type": "test-type",
        "source": "test-app",
    }
    sink = Sink(**sink_kwargs)

    app = self.get_k8s_app()
    df.add_streaming_app(app)
    df.add_sink(sink)

    graph = df.graph
    assert len(graph.nodes) == 5
    assert graph.has_edge("test-app", "test-sink")
def test_add_source(self, df: DataFlowGraph):
    """A source adds a node and an edge to its target; no pipeline is created."""
    source_kwargs = {
        "name": "test-source",
        "node_type": "test-type",
        "target": "test-app",
    }
    source = Source(**source_kwargs)

    deployment = get_streaming_app_deployment()
    df.add_streaming_app(K8sApp.factory(deployment))
    df.add_source(source)

    assert len(df.graph.nodes) == 5
    assert df.graph.has_edge("test-source", "test-app")
    # The app fixture is added without a pipeline argument here.
    assert len(df.pipelines) == 0
def test_node_attributes(self, df: DataFlowGraph):
    """The ``pipeline`` node attribute mirrors the app's pipeline (or is None)."""
    app_with_pipeline = self.get_k8s_app(name="test-app1", pipeline="pipeline1")
    df.add_streaming_app(app_with_pipeline)
    first_node = df.graph.nodes["test-app1"]
    assert first_node.get("pipeline") == "pipeline1"

    app_without_pipeline = self.get_k8s_app(name="test-app2", pipeline=None)
    df.add_streaming_app(app_without_pipeline)
    second_node = df.graph.nodes["test-app2"]
    assert second_node.get("pipeline") is None
def test_node_attributes(self, df: DataFlowGraph):
    """The ``ATTR_PIPELINE`` node attribute mirrors the app's pipeline (or is None)."""
    deployment_with_pipeline = get_streaming_app_deployment(
        name="test-app1",
        pipeline="pipeline1",
    )
    df.add_streaming_app(K8sApp.factory(deployment_with_pipeline))
    first_node = df.graph.nodes["test-app1"]
    assert first_node.get(ATTR_PIPELINE) == "pipeline1"

    deployment_without_pipeline = get_streaming_app_deployment(
        name="test-app2",
        pipeline=None,
    )
    df.add_streaming_app(K8sApp.factory(deployment_without_pipeline))
    second_node = df.graph.nodes["test-app2"]
    assert second_node.get(ATTR_PIPELINE) is None
def test_pipeline_graph(self, df: DataFlowGraph):
    """Each pipeline label gets its own graph, extended by sinks and connectors."""
    first_deployment = get_streaming_app_deployment(pipeline="pipeline1")
    df.add_streaming_app(K8sApp.factory(first_deployment))

    second_deployment = get_streaming_app_deployment(
        name="test-app2",
        input_topics="input-topic2",
        error_topic="error-topic2",
        output_topic="output-topic2",
        pipeline="pipeline2",
        multiple_inputs="0=output-topic",
    )
    df.add_streaming_app(K8sApp.factory(second_deployment))

    assert len(df.pipelines) == 2
    for pipeline_name in ("pipeline1", "pipeline2"):
        assert pipeline_name in df.pipelines

    pipeline1 = df.pipelines["pipeline1"]
    pipeline2 = df.pipelines["pipeline2"]

    expected_pipeline1 = {
        "input-topic",
        "output-topic",
        "test-app",
        "error-topic",
    }
    assert set(pipeline1.nodes) == expected_pipeline1

    expected_pipeline2 = {
        "output-topic",
        "input-topic2",
        "output-topic2",
        "test-app2",
        "error-topic2",
    }
    assert set(pipeline2.nodes) == expected_pipeline2

    # A sink attached to pipeline1's output topic must show up in pipeline1.
    sink = Sink("test-sink", "output-topic")
    df.add_sink(sink)
    assert "test-sink" in pipeline1.nodes

    # A source connector touching pipeline2's input topic joins pipeline2,
    # together with its additional topic.
    source_connector = KafkaConnector(
        name="test-source-connector",
        type=KafkaConnectorTypesEnum.SOURCE,
        topics=["input-topic2", "source-topic"],
        config={},
    )
    df.add_connector(source_connector)
    for node_name in ("test-source-connector", "source-topic"):
        assert node_name in pipeline2.nodes
def test_multiple_pipelines_apps(self, df: DataFlowGraph):
    """Connected apps with different pipeline labels keep separate pipelines."""
    upstream = get_streaming_app_deployment(
        name="test-app1",
        input_topics="input-topic1",
        error_topic=None,
        output_topic="output-topic1",
        pipeline="pipeline1",
    )
    df.add_streaming_app(K8sApp.factory(upstream))

    # The second app consumes the first app's output topic.
    downstream = get_streaming_app_deployment(
        name="test-app2",
        input_topics="output-topic1",
        error_topic=None,
        output_topic="output-topic2",
        pipeline="pipeline2",
    )
    df.add_streaming_app(K8sApp.factory(downstream))

    assert len(df.pipelines) == 2
    for pipeline_name in ("pipeline1", "pipeline2"):
        assert pipeline_name in df.pipelines

    pipeline1 = df.pipelines["pipeline1"]
    pipeline2 = df.pipelines["pipeline2"]

    expected_pipeline1 = {"test-app1", "input-topic1", "output-topic1"}
    expected_pipeline2 = {"test-app2", "output-topic1", "output-topic2"}
    assert set(pipeline1.nodes) == expected_pipeline1
    assert set(pipeline2.nodes) == expected_pipeline2
def test_extract_independent_pipelines(self, df: DataFlowGraph):
    """Independent graphs are keyed by app name, pipeline label or bare node."""
    # NOTE(review): this assigns to the shared ``settings`` object and does
    # not restore it afterwards - confirm that other tests do not depend on
    # the previous value.
    settings.k8s.independent_graph.label = "pipeline"

    default_app = self.get_k8s_app()
    df.add_streaming_app(default_app)

    labelled_app = self.get_k8s_app(
        name="type2-app2",
        input_topics="input-topic2",
        error_topic="error-topic2",
        output_topic="output-topic2",
        pipeline="pipeline2",
    )
    df.add_streaming_app(labelled_app)

    df.extract_independent_pipelines()
    for graph_key in ("test-app", "pipeline2"):
        assert graph_key in df.independent_graphs

    # A node added directly to the graph is expected as its own entry.
    df.graph.add_node("test-node")
    df.extract_independent_pipelines()
    assert "test-node" in df.independent_graphs
def test_add_connector(self, df: DataFlowGraph):
    """Sink and source connectors are wired to their topics in the right direction."""
    sink_connector = KafkaConnector(
        name="test-sink-connector",
        type=KafkaConnectorTypesEnum.SINK,
        topics=["output-topic"],
        config={},
        error_topic="dead-letter-topic",
    )
    source_connector = KafkaConnector(
        name="test-source-connector",
        type=KafkaConnectorTypesEnum.SOURCE,
        topics=["input-topic", "input-topic2"],
        config={},
    )

    df.add_streaming_app(self.get_k8s_app())
    for connector in (sink_connector, source_connector):
        df.add_connector(connector)

    assert len(df.graph.nodes) == 8

    expected_edges = (
        ("output-topic", "test-sink-connector"),
        ("test-sink-connector", "dead-letter-topic"),
        ("test-source-connector", "input-topic"),
        ("test-source-connector", "input-topic2"),
    )
    for origin, target in expected_edges:
        assert df.graph.has_edge(origin, target)
class StreamsExplorer:
    """Refreshes applications/connectors and answers graph and node queries.

    NOTE(review): ``get_cron_jobs``, ``__add_app``, ``__get_connectors`` and
    ``__create_graph`` are called below but are not defined in the visible
    part of this class - presumably they live further down; confirm.
    """

    # Kubernetes context and namespaces are read once from the settings.
    context = settings.k8s.deployment.context
    namespaces = settings.k8s.deployment.namespaces

    def __init__(
        self,
        linking_service: LinkingService,
        metric_provider: Type[MetricProvider],
    ):
        """Create empty registries, the data-flow graph and the Kafka client.

        Args:
            linking_service: Provides node info lists and redirect links.
            metric_provider: Metric provider class passed to ``DataFlowGraph``.
        """
        self.applications: Dict[str, K8sApp] = {}
        self.kafka_connectors: List[KafkaConnector] = []
        self.data_flow = DataFlowGraph(metric_provider=metric_provider)
        self.linking_service = linking_service
        self.kafka = Kafka()

    def setup(self):
        """Load the Kubernetes configuration and create the API clients."""
        self.__setup_k8s_environment()

    async def update(self):
        """Discard the current state, rebuild it and store the JSON graph."""
        self.applications = {}
        self.kafka_connectors = []
        extractor_container.reset()
        self.data_flow.reset()
        self.__retrieve_deployments()
        self.__retrieve_cron_jobs()
        self.__get_connectors()
        self.__create_graph()
        self.data_flow.setup_metric_provider()
        await self.data_flow.store_json_graph()

    def get_positioned_json_graph(self) -> dict:
        """Return the ``json_graph`` held by the data-flow graph."""
        return self.data_flow.json_graph

    async def get_positioned_pipeline_json_graph(
        self, pipeline_name: str
    ) -> Optional[dict]:
        """Return the positioned graph of ``pipeline_name`` from the data flow."""
        return await self.data_flow.get_positioned_pipeline_graph(pipeline_name)

    def get_pipeline_names(self) -> List[str]:
        """Return the keys of the data flow's ``pipelines`` mapping as a list."""
        return list(self.data_flow.pipelines.keys())

    async def get_metrics(self) -> List[Metric]:
        """Return the metrics reported by the data-flow graph."""
        return await self.data_flow.get_metrics()

    @ttl_cache(ttl=settings.node_info.cache_ttl)
    def get_node_information(self, node_id: str):
        """Build the ``NodeInformation`` for ``node_id`` according to its type.

        Handles connectors, (error) topics, streaming apps and node types
        listed in ``linking_service.sink_source_info``.

        Returns:
            A ``NodeInformation``, or ``None`` when the node type matches
            none of the handled kinds.
        """
        node_type = self.data_flow.get_node_type(node_id)
        if node_type == NodeTypesEnum.CONNECTOR:
            config = KafkaConnect.get_connector_config(node_id)
            return NodeInformation(
                node_id=node_id,
                node_type=node_type,
                info=self.linking_service.connector_info
                + get_displayed_information_connector(config),
            )
        elif node_type == NodeTypesEnum.TOPIC or node_type == NodeTypesEnum.ERROR_TOPIC:
            # Work on a copy: appending to ``linking_service.topic_info``
            # itself would permanently grow the shared list, so every later
            # lookup would carry entries from earlier topics.
            info = list(self.linking_service.topic_info)
            if self.kafka.enabled:
                partitions = self.kafka.get_topic_partitions(node_id)
                if partitions is not None:
                    info.append(
                        NodeInfoListItem(
                            name="Partitions",
                            value=len(partitions),
                            type=NodeInfoType.BASIC,
                        )
                    )
                config = self.kafka.get_topic_config(node_id)
                info += get_displayed_information_topic(config)
            # The schema entry is added with an empty placeholder value.
            info.append(
                NodeInfoListItem(
                    name="Schema",
                    value={},
                    type=NodeInfoType.JSON,
                )
            )
            return NodeInformation(
                node_id=node_id,
                node_type=node_type,
                info=info,
            )
        elif node_type == NodeTypesEnum.STREAMING_APP:
            info = get_displayed_information_deployment(self.applications[node_id])
            return NodeInformation(
                node_id=node_id,
                node_type=node_type,
                info=self.linking_service.streaming_app_info + info,
            )
        elif node_type in self.linking_service.sink_source_info:
            return NodeInformation(
                node_id=node_id,
                node_type=NodeTypesEnum.SINK_SOURCE,
                info=self.linking_service.sink_source_info[node_type],
            )
        return None

    def get_link(self, node_id: str, link_type: Optional[str]):
        """Return the redirect for ``node_id`` from the linking service.

        Returns ``None`` when the node type matches none of the handled kinds.
        """
        node_type = self.data_flow.get_node_type(node_id)
        if node_type == NodeTypesEnum.CONNECTOR:
            config = KafkaConnect.get_connector_config(node_id)
            return self.linking_service.get_redirect_connector(config, link_type)
        if node_type == NodeTypesEnum.TOPIC or node_type == NodeTypesEnum.ERROR_TOPIC:
            return self.linking_service.get_redirect_topic(node_id, link_type)
        if node_type == NodeTypesEnum.STREAMING_APP:
            return self.linking_service.get_redirect_streaming_app(
                self.applications[node_id], link_type
            )
        if node_type in self.linking_service.sink_source_redirects:
            return self.linking_service.get_sink_source_redirects(node_type, node_id)
        return None

    def __setup_k8s_environment(self):
        """Load the K8s config (in-cluster or kube-config) and create clients.

        Raises:
            Exception: If the Kubernetes configuration cannot be loaded; the
                original ``ConfigException`` is attached as the cause.
        """
        try:
            if settings.k8s.deployment.cluster:
                logger.info("Setup K8s environment in cluster")
                kubernetes.config.load_incluster_config()
            else:
                logger.info("Setup K8s environment")
                kubernetes.config.load_kube_config(context=self.context)
        except kubernetes.config.ConfigException as err:
            # Chain the cause so the underlying config problem stays visible.
            raise Exception("Could not load K8s environment configuration") from err

        self.k8s_app_client = kubernetes.client.AppsV1Api()
        self.k8s_batch_client = kubernetes.client.BatchV1beta1Api()

    def __retrieve_deployments(self):
        """Register an app for every deployment and stateful set found."""
        items: List[K8sObject] = []
        items += self.get_deployments()
        items += self.get_stateful_sets()
        for item in items:
            try:
                app = K8sApp.factory(item)
                self.__add_app(app)
            except Exception as e:
                # Best effort: an object that cannot be turned into an app is
                # logged at debug level and skipped.
                logger.debug(e)

    def get_deployments(self) -> List[V1Deployment]:
        """List the deployments of all configured namespaces."""
        deployments: List[V1Deployment] = []
        for namespace in self.namespaces:
            logger.info(f"List deployments in namespace {namespace}")
            deployments += self.k8s_app_client.list_namespaced_deployment(
                namespace=namespace, watch=False
            ).items
        return deployments

    def get_stateful_sets(self) -> List[V1StatefulSet]:
        """List the stateful sets of all configured namespaces."""
        stateful_sets: List[V1StatefulSet] = []
        for namespace in self.namespaces:
            logger.info(f"List statefulsets in namespace {namespace}")
            stateful_sets += self.k8s_app_client.list_namespaced_stateful_set(
                namespace=namespace, watch=False
            ).items
        return stateful_sets

    def __retrieve_cron_jobs(self):
        """Register every cron job the extractor container turns into an app."""
        logger.info("Retrieve cronjob descriptions")
        cron_jobs = self.get_cron_jobs()
        for cron_job in cron_jobs:
            if app := extractor_container.on_cron_job(cron_job):
                self.__add_app(app)
class StreamsExplorer:
    """Collects K8s streaming apps and Kafka connectors into a data-flow graph.

    ``setup`` prepares the Kubernetes clients, ``update`` rebuilds the graph
    from scratch, and the ``get_*`` methods answer graph, metric,
    node-information and link queries.
    """

    # Kubernetes context and namespaces are read once from the settings.
    context = settings.k8s.deployment.context
    namespaces = settings.k8s.deployment.namespaces

    def __init__(
        self,
        linking_service: LinkingService,
        metric_provider: Type[MetricProvider],
    ):
        """Create empty registries and the data-flow graph.

        Args:
            linking_service: Provides node info lists and redirect links.
            metric_provider: Metric provider class passed to ``DataFlowGraph``.
        """
        self.applications: Dict[str, K8sApp] = {}
        self.kafka_connectors: List[KafkaConnector] = []
        self.data_flow = DataFlowGraph(metric_provider=metric_provider)
        self.linking_service = linking_service

    def setup(self):
        """Load the Kubernetes configuration and create the API clients."""
        self.__setup_k8s_environment()

    def update(self):
        """Discard the current state and rebuild applications and graph."""
        self.applications = {}
        self.kafka_connectors = []
        extractor_container.reset()
        self.data_flow.reset()
        self.__retrieve_deployments()
        self.__retrieve_cron_jobs()
        self.__get_connectors()
        self.__create_graph()

    def get_positioned_json_graph(self) -> dict:
        """Return the positioned graph from the data flow."""
        return self.data_flow.get_positioned_graph()

    def get_positioned_pipeline_json_graph(self, pipeline_name: str) -> dict:
        """Return the positioned graph of ``pipeline_name`` from the data flow."""
        return self.data_flow.get_positioned_pipeline_graph(pipeline_name)

    def get_pipeline_names(self) -> List[str]:
        """Return the keys of the data flow's ``independent_graphs`` as a list."""
        return list(self.data_flow.independent_graphs.keys())

    def get_metrics(self) -> List:
        """Return the metrics reported by the data-flow graph."""
        return self.data_flow.get_metrics()

    def get_node_information(self, node_id: str):
        """Build the ``NodeInformation`` for ``node_id`` according to its type.

        Handles connectors, (error) topics, streaming apps and node types
        listed in ``linking_service.sink_source_info``.

        Returns:
            A ``NodeInformation``, or ``None`` when the node type matches
            none of the handled kinds.
        """
        node_type = self.data_flow.get_node_type(node_id)
        if node_type == NodeTypesEnum.CONNECTOR:
            config = KafkaConnect.get_connector_config(node_id)
            return NodeInformation(
                node_id=node_id,
                node_type=node_type,
                info=self.linking_service.connector_info
                + get_displayed_information_connector(config),
            )
        if node_type == NodeTypesEnum.TOPIC or node_type == NodeTypesEnum.ERROR_TOPIC:
            # ``+`` builds a new list, so the linking service's own
            # ``topic_info`` is left untouched.
            return NodeInformation(
                node_id=node_id,
                node_type=node_type,
                info=self.linking_service.topic_info
                + [
                    NodeInfoListItem(
                        name="Schema",
                        value=SchemaRegistry.get_newest_topic_value_schema(node_id),
                        type=NodeInfoType.JSON,
                    )
                ],
            )
        if node_type == NodeTypesEnum.STREAMING_APP:
            info = get_displayed_information_deployment(self.applications[node_id])
            return NodeInformation(
                node_id=node_id,
                node_type=node_type,
                info=self.linking_service.streaming_app_info + info,
            )
        if node_type in self.linking_service.sink_source_info:
            return NodeInformation(
                node_id=node_id,
                node_type=NodeTypesEnum.SINK_SOURCE,
                info=self.linking_service.sink_source_info[node_type],
            )
        return None

    def get_link(self, node_id: str, link_type: Optional[str]):
        """Return the redirect for ``node_id`` from the linking service.

        Returns ``None`` when the node type matches none of the handled kinds.
        """
        node_type = self.data_flow.get_node_type(node_id)
        if node_type == NodeTypesEnum.CONNECTOR:
            config = KafkaConnect.get_connector_config(node_id)
            return self.linking_service.get_redirect_connector(config, link_type)
        if node_type == NodeTypesEnum.TOPIC or node_type == NodeTypesEnum.ERROR_TOPIC:
            return self.linking_service.get_redirect_topic(node_id, link_type)
        if node_type == NodeTypesEnum.STREAMING_APP:
            return self.linking_service.get_redirect_streaming_app(
                self.applications[node_id], link_type
            )
        if node_type in self.linking_service.sink_source_redirects:
            return self.linking_service.get_sink_source_redirects(node_type, node_id)
        return None

    def __setup_k8s_environment(self):
        """Load the K8s config (in-cluster or kube-config) and create clients.

        Raises:
            Exception: If the Kubernetes configuration cannot be loaded; the
                original ``ConfigException`` is attached as the cause.
        """
        try:
            if settings.k8s.deployment.cluster:
                logger.info("Setup K8s environment in cluster")
                kubernetes.config.load_incluster_config()
            else:
                logger.info("Setup K8s environment")
                kubernetes.config.load_kube_config(context=self.context)
        except kubernetes.config.ConfigException as err:
            # Chain the cause so the underlying config problem stays visible.
            raise Exception("Could not load K8s environment configuration") from err

        self.k8s_app_client = kubernetes.client.AppsV1Api()
        self.k8s_batch_client = kubernetes.client.BatchV1beta1Api()

    def __retrieve_deployments(self):
        """Register streams-bootstrap apps among deployments and stateful sets."""
        items = self.get_deployments() + self.get_stateful_sets()
        for item in items:
            try:
                app = K8sApp.factory(item)
                if app.is_streams_bootstrap_app():
                    self.applications[app.name] = app
            except Exception as e:
                # Best effort: an object that cannot be turned into an app is
                # logged at debug level and skipped.
                logger.debug(e)

    def get_deployments(self) -> List[V1Deployment]:
        """List the deployments of all configured namespaces."""
        deployments: List[V1Deployment] = []
        for namespace in self.namespaces:
            logger.info(f"List deployments in namespace {namespace}")
            deployments += self.k8s_app_client.list_namespaced_deployment(
                namespace=namespace, watch=False
            ).items
        return deployments

    def get_stateful_sets(self) -> List[V1StatefulSet]:
        """List the stateful sets of all configured namespaces."""
        stateful_sets: List[V1StatefulSet] = []
        for namespace in self.namespaces:
            logger.info(f"List statefulsets in namespace {namespace}")
            stateful_sets += self.k8s_app_client.list_namespaced_stateful_set(
                namespace=namespace, watch=False
            ).items
        return stateful_sets

    def __retrieve_cron_jobs(self):
        """Register every cron job the extractor container turns into an app."""
        logger.info("Retrieve cronjob descriptions")
        cron_jobs = self.get_cron_jobs()
        for cron_job in cron_jobs:
            app: Optional[K8sApp] = extractor_container.on_cron_job(cron_job)
            if app:
                self.applications[app.name] = app

    def get_cron_jobs(self) -> List[V1beta1CronJob]:
        """List the cron jobs of all configured namespaces."""
        cron_jobs: List[V1beta1CronJob] = []
        for namespace in self.namespaces:
            logger.info(f"List cronjobs in namespace {namespace}")
            cron_jobs += self.k8s_batch_client.list_namespaced_cron_job(
                namespace=namespace, watch=False
            ).items
        return cron_jobs

    def __get_connectors(self):
        """Fetch the Kafka connectors from Kafka Connect."""
        logger.info("Retrieve Kafka connectors")
        self.kafka_connectors = KafkaConnect.connectors()

    def __create_graph(self):
        """Add apps, connectors, sources and sinks, then extract pipelines."""
        logger.info("Setup pipeline graph")
        # Only the app objects are needed, not their names.
        for app in self.applications.values():
            self.data_flow.add_streaming_app(app)

        for connector in self.kafka_connectors:
            self.data_flow.add_connector(connector)

        sources, sinks = extractor_container.get_sources_sinks()
        for source in sources:
            self.data_flow.add_source(source)
        for sink in sinks:
            self.data_flow.add_sink(sink)

        # extract subgraphs
        logger.info("Extract independent pipelines")
        self.data_flow.extract_independent_pipelines()
def test_multiple_pipelines_sink_source(self, df: DataFlowGraph):
    """Shared connectors, sinks and sources join both pipelines; unrelated ones join none."""
    first_deployment = get_streaming_app_deployment(
        name="test-app1",
        input_topics="input-topic1",
        error_topic="error-topic1",
        output_topic="output-topic1",
        pipeline="pipeline1",
    )
    df.add_streaming_app(K8sApp.factory(first_deployment))

    second_deployment = get_streaming_app_deployment(
        name="test-app2",
        input_topics="input-topic2",
        error_topic="error-topic2",
        output_topic="output-topic2",
        pipeline="pipeline2",
    )
    df.add_streaming_app(K8sApp.factory(second_deployment))

    assert len(df.pipelines) == 2
    for pipeline_name in ("pipeline1", "pipeline2"):
        assert pipeline_name in df.pipelines

    pipeline1 = df.pipelines["pipeline1"]
    pipeline2 = df.pipelines["pipeline2"]
    both_pipelines = (pipeline1, pipeline2)

    expected_pipeline1 = {
        "test-app1",
        "input-topic1",
        "output-topic1",
        "error-topic1",
    }
    expected_pipeline2 = {
        "test-app2",
        "input-topic2",
        "output-topic2",
        "error-topic2",
    }
    assert set(pipeline1.nodes) == expected_pipeline1
    assert set(pipeline2.nodes) == expected_pipeline2

    # Sink connector reading from both pipelines' output topics.
    sink_connector = KafkaConnector(
        name="test-sink-connector",
        type=KafkaConnectorTypesEnum.SINK,
        topics=["output-topic1", "output-topic2"],
        config={},
    )
    df.add_connector(sink_connector)
    assert "test-sink-connector" in df.graph.nodes
    for pipeline in both_pipelines:
        assert "test-sink-connector" in pipeline.nodes

    # Sink hanging off that shared connector.
    df.add_sink(Sink("test-sink", "test-sink-connector"))
    assert "test-sink" in df.graph.nodes
    for pipeline in both_pipelines:
        assert "test-sink" in pipeline.nodes

    # Source connector writing to both pipelines' input topics.
    source_connector = KafkaConnector(
        name="test-source-connector",
        type=KafkaConnectorTypesEnum.SOURCE,
        topics=["input-topic1", "input-topic2"],
        config={},
    )
    df.add_connector(source_connector)
    assert "test-source-connector" in df.graph.nodes
    for pipeline in both_pipelines:
        assert "test-source-connector" in pipeline.nodes

    # Source feeding that shared connector.
    df.add_source(Source("test-source", "test-source-connector"))
    assert "test-source" in df.graph.nodes
    for pipeline in both_pipelines:
        assert "test-source" in pipeline.nodes

    # A sink connector on an *input* topic is in the graph but in no pipeline.
    unrelated_sink_connector = KafkaConnector(
        name="unrelated-sink-connector",
        type=KafkaConnectorTypesEnum.SINK,
        topics=["input-topic1"],
        config={},
    )
    df.add_connector(unrelated_sink_connector)
    assert "unrelated-sink-connector" in df.graph.nodes
    for pipeline in both_pipelines:
        assert "unrelated-sink-connector" not in pipeline.nodes

    # A source connector on an *output* topic is in the graph but in no pipeline.
    unrelated_source_connector = KafkaConnector(
        name="unrelated-source-connector",
        type=KafkaConnectorTypesEnum.SOURCE,
        topics=["output-topic1"],
        config={},
    )
    df.add_connector(unrelated_source_connector)
    assert "unrelated-source-connector" in df.graph.nodes
    for pipeline in both_pipelines:
        assert "unrelated-source-connector" not in pipeline.nodes
def test_verify_connector_exists_in_pipeline(self, df: DataFlowGraph):
    """Check a sink only pulls in the connector that belongs to each pipeline.

    The same sink is attached to one connector per pipeline; each pipeline
    must then contain the sink and its own connector, but not the other one.
    """
    app_specs = (
        ("test-app1", "input-topic1", "output-topic1", "pipeline1"),
        ("test-app2", "input-topic2", "output-topic2", "pipeline2"),
    )
    for app_name, input_topic, output_topic, pipeline_name in app_specs:
        deployment = get_streaming_app_deployment(
            name=app_name,
            input_topics=input_topic,
            error_topic=None,
            output_topic=output_topic,
            pipeline=pipeline_name,
        )
        df.add_streaming_app(K8sApp.factory(deployment))

    connector_specs = (
        ("sink-connector1", "output-topic1"),
        ("sink-connector2", "output-topic2"),
    )
    for connector_name, topic in connector_specs:
        connector = KafkaConnector(
            name=connector_name,
            type=KafkaConnectorTypesEnum.SINK,
            topics=[topic],
            config={},
        )
        df.add_connector(connector)

    # The very same Sink object is added twice, re-pointed in between.
    sink = Sink(
        name="test-sink",
        node_type="test-type",
        source="sink-connector1",
    )
    df.add_sink(sink)
    sink.source = "sink-connector2"
    df.add_sink(sink)

    assert len(df.pipelines) == 2
    for pipeline_name in ("pipeline1", "pipeline2"):
        assert pipeline_name in df.pipelines

    pipeline1 = df.pipelines["pipeline1"]
    pipeline2 = df.pipelines["pipeline2"]

    assert "sink-connector1" in pipeline1.nodes
    assert "sink-connector1" not in pipeline2.nodes
    assert "sink-connector2" in pipeline2.nodes
    assert "sink-connector2" not in pipeline1.nodes
    for pipeline in (pipeline1, pipeline2):
        assert "test-sink" in pipeline.nodes

    expected_pipeline1 = {
        "test-app1",
        "input-topic1",
        "output-topic1",
        "sink-connector1",
        "test-sink",
    }
    expected_pipeline2 = {
        "test-app2",
        "input-topic2",
        "output-topic2",
        "sink-connector2",
        "test-sink",
    }
    assert set(pipeline1.nodes) == expected_pipeline1
    assert set(pipeline2.nodes) == expected_pipeline2
def test_get_node_type(self, df: DataFlowGraph):
    """The node created for an added app reports the ``streaming-app`` type."""
    app = self.get_k8s_app()
    df.add_streaming_app(app)

    node_type = df.get_node_type("test-app")
    assert node_type == "streaming-app"
def df(self) -> DataFlowGraph:
    """Provide a fresh ``DataFlowGraph`` built with the ``MetricProvider`` class."""
    data_flow_graph = DataFlowGraph(metric_provider=MetricProvider)
    return data_flow_graph
def test_get_node_type(self, df: DataFlowGraph):
    """The node created for an added app reports the ``streaming-app`` type."""
    deployment = get_streaming_app_deployment()
    df.add_streaming_app(K8sApp.factory(deployment))

    node_type = df.get_node_type("test-app")
    assert node_type == "streaming-app"