def split(self, source_file, target_file_pattern, paths_to_files):
    """
    Write selected parts of an XML file into separate target files.

    :param source_file: path of the original XML file
    :param target_file_pattern: prefix of the target files; every target file is
        named ``<target_file_pattern>.<filenumber>.xml``
    :param paths_to_files: mapping from a path that selects a part of the
        original document to the set of integer file numbers this part is
        written to. The mapping is rewritten in place: every file number is
        replaced by the full target file name.
    :return: the same mapping, now holding sets of target file names
    """
    self.path_to_files = paths_to_files

    # Check the complete input before any value is rewritten.
    for filenums in paths_to_files.values():
        assert isinstance(filenums, set)
        for filenum in filenums:
            assert isinstance(filenum, int)

    # Turn every file number into the name of its target file.
    for path, filenums in paths_to_files.items():
        paths_to_files[path] = {
            "%s.%s.xml" % (target_file_pattern, filenum) for filenum in filenums
        }

    # self.register_write_nodes([x for x in self.path_to_files])
    self.walk_tree(file_path=source_file)
    self.post_actions()
    self.close()

    Logger.info("splitting %s completed" % source_file)
    return self.path_to_files
def check_for_references(self, **kwargs):
    """
    FastXMLWalker callback function

    Records references between IDs. Every attribute value of the current
    element (the ``ID`` attribute itself excluded) that occurs in ``self.IDs``
    is treated as a reference target. The referencing ID is taken from the
    ``ID`` attribute of the element itself or, failing that, of its nearest
    ancestor that carries one. If such an element is found and its ID differs
    from the target, the target is added to ``self.Refs[<referencing ID>]``.

    A reference for which neither the element nor any ancestor has an ``ID``
    is skipped, so nothing is ever stored under the key ``None``.

    :param kwargs: FastXMLWalker kwargs; ``kwargs["element"]`` is the current element
    :return: None
    """
    element = kwargs["element"]
    for name, target_id in element.attrib.items():
        if name == "ID" or target_id not in self.IDs:
            continue

        # Walk from the element towards the root until an ID is found.
        source_id = None
        node = element
        while node is not None:
            if "ID" in node.attrib:
                source_id = node.attrib["ID"]
                break
            node = node.getparent()

        # No owning ID found, or the element merely refers to itself.
        if source_id is None or source_id == target_id:
            continue

        self.Refs[source_id].add(target_id)
        if len(self.Refs) % 10000 == 0:
            Logger.info("%s Refs" % len(self.Refs))
def search_stepxml(self, myfile, split_path_node_size_tuples=None):
    """
    Search for connected sets in a StepXML file

    The file is walked twice. The first walk fires ``self.add_id`` on every
    ``//@ID`` and ``self.add_split_node`` on every configured split path; the
    second walk fires ``self.check_for_references`` on every start event.
    Afterwards indirect references are resolved, the connected sets are
    calculated, translated from IDs to exact paths via ``self.IDs2Exact`` and
    handed to a NodeDistributor.

    :param myfile: Path to StepXML file
    :param split_path_node_size_tuples: list of (path to splitnode ID, node restriction)
        tuples (e.g.: [("//{http://www.stibosystems.com/step}Product/@ID", 10)]);
        this example is also the default used when nothing (or an empty list) is passed
    :return: the distribution to files returned by ``NodeDistributor.distribute()``
    """
    fx = FastXMLCallbackWalker()
    interests = {
        Interest(
            interest="//@ID",
            callback=self.add_id
        )
    }
    if not split_path_node_size_tuples:
        split_path_node_size_tuples = [("//{http://www.stibosystems.com/step}Product/@ID", 10)]
    for splitnode_path, node_restriction in split_path_node_size_tuples:
        interests.add(
            SplitPath(
                interest=splitnode_path,
                callback=self.add_split_node,
                node_restriction=node_restriction
            )
        )
    fx.register_interests(interests)
    fx.walk_tree(myfile)
    Logger.debug("IDs: %s" % self.IDs)
    Logger.debug("split_nodes: %s" % self.SplitNodes)
    # SplitNodes maps each interest to its list of nodes, so the number of
    # split nodes is the sum of the list lengths, not the number of keys.
    split_node_count = sum(len(nodes) for nodes in self.SplitNodes.values())
    Logger.info("%s IDs found, %s SplitNodes found" % (len(self.IDs), split_node_count))

    # second walk: collect the direct references between the IDs found above
    fx2 = FastXMLCallbackWalker()
    fx2.register_event_callback("start", self.check_for_references)
    fx2.walk_tree(myfile)
    Logger.debug("direct: %s" % self.Refs)
    Logger.info("%s direct dependencies found" % (len(self.Refs)))

    idr = IndirectIDResovler(self.Refs, self.SplitNodes)
    idr.resolve_indirect()
    Logger.debug("indirect: %s" % idr.refs)
    Logger.info("%s indirect dependencies found" % (len(idr.refs)))

    connected_sets = self.calc_connected_sets(self.Refs, self.SplitNodes)
    Logger.debug("connected_sets: %s" % connected_sets)
    Logger.info("connected sets calculation completed")

    # translate every connected set of IDs into the set of exact paths of those IDs
    connected_sets2 = {}
    for path in connected_sets:
        exact_sets = []
        for connected_set in connected_sets[path]:
            exact_paths = set()
            for item in connected_set:
                exact_paths.update(self.IDs2Exact[item])
            exact_sets.append(exact_paths)
        connected_sets2[path] = exact_sets

    nd = NodeDistributor(connected_sets2)
    distribution_to_files = nd.distribute()
    Logger.debug("distribution to files: %s" % distribution_to_files)
    Logger.info("distribution to files completed")
    return distribution_to_files
def add_split_node(self, **kwargs):
    """
    FastXMLWalker callback function

    Meant to fire on split nodes: the exact path reported by the walker
    (``kwargs["walker"].exact_path``) is appended to
    ``self.SplitNodes[<interest>]``.

    :param kwargs: FastXMLWalker kwargs (``"interest"`` and ``"walker"`` are read)
    :return: None
    """
    nodes_of_interest = self.SplitNodes[kwargs["interest"]]
    nodes_of_interest.append(kwargs["walker"].exact_path)

    # progress message every 10000 collected split nodes
    total = sum(len(paths) for paths in self.SplitNodes.values())
    if total % 10000 == 0:
        Logger.info("%s SplitNodes" % total)
def add_id(self, **kwargs):
    """
    FastXMLWalker callback function

    Meant to fire on a node that has an ``ID`` attribute; the value of that
    attribute is added to the set ``self.IDs``.

    :param kwargs: FastXMLWalker kwargs (``kwargs["element"]`` is read)
    :return: None
    """
    found_id = kwargs["element"].attrib["ID"]
    if found_id in self.IDs:
        return

    self.IDs.add(found_id)
    # progress message every 10000 distinct IDs
    id_count = len(self.IDs)
    if id_count % 10000 == 0:
        Logger.info("%s IDs" % id_count)
def add_split_node(self, **kwargs):
    """
    FastXMLWalker callback function

    Meant to fire on a split node. The node's ``ID`` attribute is appended to
    ``self.SplitNodes[<interest>]`` and the walker's exact path is added to
    ``self.IDs2Exact[<ID>]``.

    :param kwargs: FastXMLWalker kwargs (``"element"``, ``"interest"`` and
        ``"walker"`` are read)
    :return: None
    """
    node_id = kwargs["element"].attrib["ID"]
    self.SplitNodes[kwargs['interest']].append(node_id)
    self.IDs2Exact[node_id].add(kwargs["walker"].exact_path)

    # progress message every 10000 collected split nodes
    total = sum(len(ids) for ids in self.SplitNodes.values())
    if total % 10000 == 0:
        Logger.info("%s SplitNodes identified" % total)
def add_split_node_id(self, **kwargs):
    """
    FastXMLWalker callback function

    Meant to fire on the id node of a split node. The element's text is taken
    as the ID, ``self.add_id`` is invoked with the same kwargs, and the ID is
    appended to ``self.ExactPathIDs2SplitNodes`` under the exact path of the
    parent node (the walker's exact path without its last segment).

    :param kwargs: FastXMLWalker kwargs (``"element"`` and ``"walker"`` are read)
    :return: None
    """
    found_id = kwargs["element"].text
    self.add_id(**kwargs)

    parent_path = kwargs["walker"].exact_path.rsplit("/", 1)[0]
    self.ExactPathIDs2SplitNodes[parent_path].append(found_id)

    # NOTE(review): presumably a no-op whenever self.add_id has already stored
    # the ID - confirm against the add_id bound in this class.
    if found_id in self.IDs:
        return
    self.IDs.add(found_id)
    id_count = len(self.IDs)
    if id_count % 10000 == 0:
        Logger.info("%s IDs identified" % id_count)
def add_id(self, **kwargs):
    """
    FastXMLWalker callback function

    Meant to fire on nodes whose text is an ID. The exact path of the parent
    node (the walker's exact path without its last segment) is added to
    ``self.IDs2ExactPaths[<ID>]`` and the ID itself is added to ``self.IDs``.

    :param kwargs: FastXMLWalker kwargs (``"element"`` and ``"walker"`` are read)
    :return: None
    """
    found_id = kwargs["element"].text
    parent_path = kwargs["walker"].exact_path.rsplit("/", 1)[0]
    self.IDs2ExactPaths[found_id].add(parent_path)

    if found_id in self.IDs:
        return
    self.IDs.add(found_id)
    # progress message every 10000 distinct IDs
    id_count = len(self.IDs)
    if id_count % 10000 == 0:
        Logger.info("%s IDs" % id_count)
def search_genif2(self, genif_file, split_path_node_restriction_tuples):
    """
    Search connected sets in genif2 files

    One walk over the file fires the callbacks registered below (relation
    source/target, relation end, uniqueID, split nodes and their uniqueID).
    Afterwards the split node IDs are calculated, indirect references are
    resolved, connected sets are calculated and handed to a NodeDistributor.

    :param genif_file: path to a genif2 file
    :param split_path_node_restriction_tuples: list of (split path, node restriction)
        tuples; when empty or None the msx ``item`` path (restriction 1) and the
        msx ``asset`` path (restriction 2) are used
    :return: result of ``NodeDistributor.distribute()``; per the original
        documentation a list of sets of exact paths, each list entry to be
        written to a different file
    """
    # root=None
    walker = FastXMLCallbackWalker()

    if not split_path_node_restriction_tuples:
        split_path_node_restriction_tuples = [
            ("/{http://www.media-saturn.com/msx}data/{http://www.media-saturn.com/msx}item", 1),
            ("/{http://www.media-saturn.com/msx}data/{http://www.media-saturn.com/msx}asset", 2),
        ]

    # interests that do not depend on the split paths
    interests = {
        Interest(
            interest="/{http://www.media-saturn.com/msx}data/{http://www.media-saturn.com/msx}relation/{http://www.media-saturn.com/msx}source/{http://www.media-saturn.com/msx}uniqueID",
            callback=self.add_source
        ),
        Interest(
            interest="/{http://www.media-saturn.com/msx}data/{http://www.media-saturn.com/msx}relation/{http://www.media-saturn.com/msx}target/{http://www.media-saturn.com/msx}uniqueID",
            callback=self.add_target
        ),
        Interest(
            interest="//{http://www.media-saturn.com/msx}relation",
            callback=self.relation_to_ref,
            event='end'
        ),
        Interest(
            interest="//{http://www.media-saturn.com/msx}uniqueID",
            callback=self.add_id
        ),
    }

    # per split path: one interest for the node itself, one for its uniqueID child
    for entry in split_path_node_restriction_tuples:
        split_path = entry[0]
        node_restriction = entry[1]
        interests.add(
            SplitPath(
                interest=split_path,
                callback=self.add_split_node,
                node_restriction=node_restriction
            )
        )
        interests.add(
            Interest(
                interest="%s/{http://www.media-saturn.com/msx}uniqueID" % split_path,
                callback=self.add_split_node_id,
            )
        )

    walker.register_interests(interests)
    for _uuid in walker._relative_interests_trees:
        Logger.debug(RenderTree(walker._relative_interests_trees[_uuid].interest_tree))
    walker.walk_tree(genif_file)

    Logger.debug("ids: %s" % self.IDs)
    Logger.debug("split_nodes: %s" % self.SplitNodes)
    split_node_count = sum(len(self.SplitNodes[key]) for key in self.SplitNodes)
    Logger.info(
        "%s IDs, %s split nodes, %s direct references identified"
        % (len(self.IDs), split_node_count, len(self.Refs))
    )
    Logger.debug("IDsSplitNodes %s" % self.ExactPathIDs2SplitNodes)
    Logger.debug("direct: %s" % self.Refs)

    self.calc_splitnode_ids()

    resolver = IndirectIDResovler(self.Refs, self.SplitNodes)
    resolver.resolve_indirect()
    Logger.debug("indirect: %s" % resolver.refs)
    Logger.info("indirect reference calculation completed")

    connected_sets = self.calc_connected_sets(self.SplitNodes)
    Logger.debug("connected_sets: %s" % connected_sets)
    Logger.info("connected set calculation completed")

    distributor = NodeDistributor(connected_sets)
    distribution_to_files = distributor.distribute()
    Logger.debug("distribution to files: %s" % distribution_to_files)
    Logger.info("distribution to files completed")
    return distribution_to_files