def test_save_and_load(self):
    """Round-trip a two-node graph through save/load and check the node count."""
    graph_file = "test.graph"

    # Build a small graph with one property index and two nodes.
    original = GraphData()
    original.create_index_on_property("qualified_name", "alias")
    for labels, name in (
        ({"method"}, "ArrayList.add"),
        ({"override method"}, "ArrayList.pop"),
    ):
        original.add_node(labels, {"qualified_name": name})
    original.save(graph_file)

    # Read it back and make sure both nodes are still there.
    restored: GraphData = GraphData.load(graph_file)
    self.assertEqual(restored.get_node_num(), 2)
class KGBuildPipeline:
    """
    An ordered set of named components that share one GraphData and one
    MultiFieldDocumentCollection.

    Components are run in the order stored by the pipeline; listeners can be
    attached to run right before or right after a given component.
    """

    def __init__(self):
        # component name -> component instance
        self.__name2component = {}
        # component names, in execution order
        self.__component_order = []
        # shared state handed to every component on registration
        self.__graph_data = GraphData()
        self.__doc_collection = MultiFieldDocumentCollection()
        # component name -> list of listeners
        self.__before_run_component_listeners = {}
        self.__after_run_component_listeners = {}

    def __repr__(self):
        return str(self.__component_order)

    def exist_component(self, component_name):
        """
        check whether the component exist in the Pipeline
        :param component_name: the name of component
        :return: True, exist, False, not exist.
        """
        return component_name in self.__component_order

    def add_before_listener(self, component_name, listener: PipelineListener):
        """
        add a new PipelineListener running before a specific component
        :param component_name: the name of the component
        :param listener: the PipelineListener
        :return:
        :raises ComponentNotExistError: the component is not in the pipeline
        """
        if not self.exist_component(component_name):
            raise ComponentNotExistError(component_name)
        self.__before_run_component_listeners.setdefault(
            component_name, []).append(listener)

    def add_after_listener(self, component_name, listener: PipelineListener):
        """
        add a new PipelineListener running after a specific component
        :param component_name: the name of the component
        :param listener: the PipelineListener
        :return:
        :raises ComponentNotExistError: the component is not in the pipeline
        """
        # Same validation as add_before_listener, so a misspelled component
        # name is reported instead of silently never firing.
        if not self.exist_component(component_name):
            raise ComponentNotExistError(component_name)
        self.__after_run_component_listeners.setdefault(
            component_name, []).append(listener)

    def __get_component_order(self, name):
        """
        get the order of the specific component
        :param name: the specific component
        :return: the order start from 0 to num(component) - 1; -1 when the
        specific component not exist
        """
        # Pure lookup: this must never modify the component order.
        try:
            return self.__component_order.index(name)
        except ValueError:
            return -1

    def __allocate_order_for_new_component(self, before=None, after=None):
        """
        try to allocate the right position for the new component
        :param before: the component of this new component must run before
        :param after: the component of this new component must run after
        :return: the index to insert at; -1, can't not find a right order.
        """
        # By default the new component goes to the end.
        max_order = self.num_of_components()
        if before is not None:
            before_order = self.__get_component_order(before)
            # An unknown "before" component imposes no upper bound.
            if before_order != -1:
                max_order = before_order

        min_order = 0
        if after is not None:
            # An unknown "after" component yields -1 + 1 == 0, i.e. no lower
            # bound, so no special case is needed.
            min_order = self.__get_component_order(after) + 1

        if min_order > max_order:
            return -1
        return max_order

    def add_component(self, name, component: Component, before=None,
                      after=None, **config):
        """
        add a new component to this pipeline with given name. In a pipeline,
        the component name must be unique (this is not enforced here).
        :param after: the component name that this new component must run after
        :param before: the component name that this new component must run before
        :param name: the name of this new component
        :param component: the component instance
        :param config: the other config, save for update (currently unused)
        :return:
        :raises ComponentOrderError: no position satisfies before/after
        """
        order = self.__allocate_order_for_new_component(before=before,
                                                        after=after)
        if order == -1:
            raise ComponentOrderError(
                "Can't not find a right order for %s" % name)

        component.set_graph_data(self.__graph_data)
        component.set_doc_collection(self.__doc_collection)
        self.__name2component[name] = component
        self.__component_order.insert(order, name)

    def check(self):
        """
        check whether the components in the pipeline setting correct. e.g.,
        the order of the component is wrong, or something a component depends
        on is not provided by the initial data or by an earlier component.
        :return: True the pipeline is correct.
        :raises ComponentDependencyError: a component depends on entities,
        relations or document fields that are not available when it runs
        """
        current_entities = self.get_provided_entities()
        current_relations = self.get_provided_relations()
        current_document_fields = self.get_provided_document_fields()

        for component_name, component in \
                self.get_component_name_with_component_pair_by_order():
            missing_entities = \
                component.dependent_entities() - current_entities
            if missing_entities:
                raise ComponentDependencyError(component_name,
                                               missing_entities)
            current_entities.update(component.provided_entities())

            missing_relations = \
                component.dependent_relations() - current_relations
            if missing_relations:
                raise ComponentDependencyError(component_name,
                                               missing_relations)
            current_relations.update(component.provided_relations())

            missing_fields = \
                component.dependent_document_fields() - current_document_fields
            if missing_fields:
                raise ComponentDependencyError(component_name, missing_fields)
            current_document_fields.update(
                component.provided_document_fields())

        return True

    def get_provided_document_fields(self):
        """
        get the provided document field set for the pipeline from the current
        DocumentCollection.
        :return: a new set built from the collection's field set
        """
        return set(self.__doc_collection.get_field_set())

    def get_provided_relations(self):
        """
        get the provided relation type set for the pipeline from the current
        GraphData.
        :return: a new set built from the graph's relation types
        """
        # Copy, like the sibling getters: check() updates the returned set and
        # must not modify whatever object GraphData hands back.
        return set(self.__graph_data.get_all_relation_types())

    def get_provided_entities(self):
        """
        get the provided entity type set for the pipeline from the current
        GraphData.
        :return: a new set built from the graph's labels
        """
        return set(self.__graph_data.get_all_labels())

    def get_components_by_order(self):
        """
        :return: the component instances as a list, in execution order
        """
        return [self.__name2component[component_name]
                for component_name in self.__component_order]

    def get_component_name_with_component_pair_by_order(self):
        """
        :return: a list of (component name, component) tuples, in execution
        order
        """
        return [(component_name, self.__name2component[component_name])
                for component_name in self.__component_order]

    def run(self, **config):
        """
        check the pipeline, then run every component in order. Around each
        component the registered before/after listeners are called.
        :param config: passed through to the listeners
        :return:
        """
        self.check()
        print("start running the pipeline")
        for component_name in self.__component_order:
            component: Component = self.__name2component[component_name]
            self.before_run_component(component_name, **config)
            component.before_run()
            component.run()
            component.after_run()
            self.after_run_component(component_name, **config)
        print("finish running the pipeline")

    def before_run_component(self, component_name, **config):
        """
        call the listeners registered to run before the given component
        :param component_name: the name of the component about to run
        :param config: passed through to each listener
        :return:
        """
        print("start running with name=%r in the pipeline" % component_name)
        for listener in self.__before_run_component_listeners.get(
                component_name, []):
            listener.on_before_run_component(component_name, self, **config)

    def after_run_component(self, component_name, **config):
        """
        call the listeners registered to run after the given component
        :param component_name: the name of the component that just ran
        :param config: passed through to each listener
        :return:
        """
        print("finish running with name=%r in the pipeline\n" % component_name)
        for listener in self.__after_run_component_listeners.get(
                component_name, []):
            listener.on_after_run_component(component_name, self, **config)

    def save(self, graph_path=None, doc_path=None):
        """
        save the graph data object after all the building of all component
        :param doc_path: the path to save the DocumentCollection
        :param graph_path: the path to save the GraphData
        :return:
        """
        self.save_graph(path=graph_path)
        self.save_doc(path=doc_path)

    def save_graph(self, path):
        """
        save the GraphData to the path; do nothing when the path is None
        :param path: the path to save the GraphData
        :return:
        """
        if path is None:
            return
        self.__graph_data.save(path)

    def save_doc(self, path):
        """
        save the DocumentCollection to the path; do nothing when the path is
        None
        :param path: the path to save the DocumentCollection
        :return:
        """
        if path is None:
            return
        self.__doc_collection.save(path)

    def load_graph(self, graph_data_path):
        """
        replace the pipeline's GraphData with one loaded from the path and
        hand the new object to every registered component
        :param graph_data_path: the path to load the GraphData from
        :return:
        """
        self.__graph_data = GraphData.load(graph_data_path)
        # update component graph data
        for component_name in self.__component_order:
            component: Component = self.__name2component[component_name]
            component.set_graph_data(self.__graph_data)
        print("load graph")

    def load_doc(self, document_collection_path):
        """
        replace the pipeline's DocumentCollection with one loaded from the
        path and hand the new object to every registered component
        :param document_collection_path: the path to load the collection from
        :return:
        """
        self.__doc_collection = MultiFieldDocumentCollection.load(
            document_collection_path)
        # update component doc_collection
        for component_name in self.__component_order:
            component: Component = self.__name2component[component_name]
            component.set_doc_collection(self.__doc_collection)
        print("load doc collection")

    def num_of_components(self):
        """
        :return: the number of components registered in the pipeline
        """
        return len(self.__component_order)