def _return_tree_splitted_in_files(self):
    """
    Split the tree into several serialized pieces and collect the results.

    Steps:
        - Look up the observations node and the slices node of
          ``self._tree`` and take a snapshot of the observations.
        - Remove both nodes from the original tree.
        - Move the observations, at most ``_MAX_OBSERVATIONS_ALLOWED`` at
          a time, into fresh observations nodes and hand every filled
          node to ``_persist_tree_with_obs_node``.
        - Hand the slices node, when it has children, to
          ``_persist_tree_with_sli_node``. The number of slices is not
          limited: they all go into one single piece.

    Returns:
        list: the values returned by the persist helpers, observation
        pieces first and the slices piece (if any) last. Presumably
        these are the paths of the serialized files -- confirm against
        the persist helpers.

    Note:
        The original tree object is not restored afterwards.
    """
    result = []

    # Getting original obs and slices.
    original_obs_node = self._get_observations_node_of_a_tree(self._tree)
    # list(node) gives an independent snapshot of the children and works on
    # every ElementTree flavour; Element.getchildren() is gone in Python 3.9.
    every_obs = list(original_obs_node)
    original_sli_node = self._get_slices_node_of_a_tree(self._tree)

    # Removing obs and slices from the original tree.
    self._remove_slices_and_obs_from_the_original_tree()

    # Putting groups of obs in the tree and serializing.
    # NOTE: observations are taken from the end of the snapshot (last child
    # first); only the chunk boundaries matter here.
    temporal_observations_node = Element(ModelToXMLTransformer.OBSERVATIONS)
    pending = 0  # observations sitting in the node that is being filled
    while every_obs:
        temporal_observations_node.append(every_obs.pop())
        pending += 1
        # Check the limit *after* appending, so that every piece (but
        # possibly the last one) carries exactly the allowed maximum and
        # an empty node is never serialized.
        if pending == self._MAX_OBSERVATIONS_ALLOWED:
            result.append(
                self._persist_tree_with_obs_node(temporal_observations_node))
            temporal_observations_node = Element(
                ModelToXMLTransformer.OBSERVATIONS)
            pending = 0

    if pending:
        # Out of the loop, but there are still obs waiting to be included.
        result.append(
            self._persist_tree_with_obs_node(temporal_observations_node))

    # Managing slices: nothing to serialize when the tree had no slices.
    if len(original_sli_node) > 0:
        result.append(self._persist_tree_with_sli_node(original_sli_node))

    return result
class ParseTreeBuilder(object):
    '''This class supplies an alternative for
    xml.etree.ElementTree.TreeBuilder which cleans up the tree on the fly
    while building it. The main use is to normalize the tree that is
    produced by the editor widget, but it can also be used on other
    "dirty" interfaces.

    This builder takes care of the following issues:
        - Inline tags ('emphasis', 'strong', 'h', etc.) can not span
          multiple lines
        - Tags can not contain only whitespace
        - Tags can not be empty (with the exception of the 'img' tag)
        - There should be an empty line before each 'h', 'p' or 'pre'
          (with the exception of the first tag in the tree)
        - The 'p' and 'pre' elements should always end with a newline
          ('\\n')
        - Each 'p', 'pre' and 'h' should be postfixed with a newline
          ('\\n') (as a result 'p' and 'pre' are followed by an empty
          line, the 'h' does not end in a newline itself, so it is
          different)
        - Newlines ('\\n') after a <li> element are removed (optional)
        - The element '_ignore_' is silently ignored
    '''

    def __init__(self, remove_newlines_after_li=True):
        '''Constructor

        @param remove_newlines_after_li: must be true; the other mode is
        still a TODO and is rejected by the assertion below
        '''
        assert remove_newlines_after_li, 'TODO'
        self._stack = []  # stack of elements for open tags
        self._last = None  # last element opened or closed
        self._data = []  # buffer with data
        self._tail = False  # True if we are after an end tag
        self._seen_eol = 2  # track line ends on flushed data
        # starts with "2" so check is ok for first top level element

    def start(self, tag, attrib=None):
        '''Open a new element, after flushing the buffered data.

        @param tag: the tag name; '_ignore_' is skipped silently
        @param attrib: optional mapping with attributes. It is never
        modified; missing mandatory attributes ('level' for 'h', 'href'
        for 'link') are filled in on a copy after logging a warning.
        @returns: the new element, or for '_ignore_' the last element
        that was opened or closed
        '''
        if tag == '_ignore_':
            return self._last
        elif tag == 'h':
            self._flush(need_eol=2)
        elif tag in ('p', 'pre'):
            self._flush(need_eol=1)
        else:
            self._flush()

        if tag == 'h':
            if not (attrib and 'level' in attrib):
                logger.warning('Missing "level" attribute for heading')
                # Copy, so the default does not leak into the caller's dict
                attrib = dict(attrib) if attrib else {}
                attrib['level'] = 1
        elif tag == 'link':
            if not (attrib and 'href' in attrib):
                logger.warning('Missing "href" attribute for link')
                # Copy, so the default does not leak into the caller's dict
                attrib = dict(attrib) if attrib else {}
                attrib['href'] = "404"
        # TODO check other mandatory properties !

        if attrib:
            self._last = Element(tag, attrib)
        else:
            self._last = Element(tag)

        if self._stack:
            self._stack[-1].append(self._last)
        else:
            assert tag == 'zim-tree', 'root element needs to be "zim-tree"'
        self._stack.append(self._last)

        self._tail = False
        return self._last

    def end(self, tag):
        '''Close the innermost open element, after flushing the buffered
        data.

        An element that ends up without text (or with only whitespace)
        and without children is removed from its parent again, unless it
        is an 'img' or the root element. Whitespace it contained is moved
        in front of it, and the text that preceded it is put back in the
        buffer so that following data gets joined with it.

        @param tag: the tag name, must match the innermost open element
        @returns: the closed element (also when it was purged), or
        C{None} for '_ignore_'
        '''
        if tag == '_ignore_':
            return None
        elif tag in ('p', 'pre'):
            self._flush(need_eol=1)
        else:
            self._flush()

        self._last = self._stack[-1]
        assert self._last.tag == tag, \
            "end tag mismatch (expected %s, got %s)" % (self._last.tag, tag)
        self._tail = True

        # len(element) is the number of children; Element.getchildren()
        # is no longer available in Python 3.9 and later
        has_content = (
            tag == 'img'
            or (self._last.text and not self._last.text.isspace())
            or len(self._last) > 0
        )
        if len(self._stack) > 1 and not has_content:
            # purge empty tags
            if self._last.text and self._last.text.isspace():
                self._append_to_previous(self._last.text)

            empty = self._stack.pop()
            self._stack[-1].remove(empty)
            children = list(self._stack[-1])
            if children:
                # Continue after the previous sibling: take its tail back
                # into the buffer
                self._last = children[-1]
                if self._last.tail is not None:
                    self._data = [self._last.tail]
                    self._last.tail = None
            else:
                # No siblings left: continue inside the parent and take
                # its text back into the buffer
                self._last = self._stack[-1]
                self._tail = False
                if self._last.text is not None:
                    self._data = [self._last.text]
                    self._last.text = None

            return empty
        else:
            return self._stack.pop()

    def data(self, text):
        '''Buffer text; it is attached to the tree on the next flush.

        @param text: the text as (unicode) string
        '''
        assert isinstance(text, basestring)
        self._data.append(text)

    def _flush(self, need_eol=0):
        '''Attach the buffered data to the tree, either as text of the
        current open element or as tail of the last closed element.

        @param need_eol: minimum number of newlines the data flushed so
        far must end with; missing newlines are added
        '''
        text = ''.join(self._data)

        # Fix trailing newlines
        if text:
            m = count_eol_re.search(text)
            if m:
                self._seen_eol = len(m.group(0))
            else:
                self._seen_eol = 0

        if need_eol > self._seen_eol:
            text += '\n' * (need_eol - self._seen_eol)
            self._seen_eol = need_eol

        # Fix prefix newlines
        if self._tail and self._last.tag in ('h', 'p') \
                and not text.startswith('\n'):
            if text:
                text = '\n' + text
            else:
                text = '\n'
                self._seen_eol = 1
        elif self._tail and self._last.tag == 'li' \
                and text.startswith('\n'):
            text = text[1:]
            if not text.strip('\n'):
                self._seen_eol -= 1

        if text:
            assert self._last is not None, 'data seen before root element'
            self._data = []

            # Tags that are not allowed to have newlines
            if not self._tail and self._last.tag in (
                    'h', 'emphasis', 'strong', 'mark', 'strike', 'code'):
                # assume no nested tags in these types ...
                if self._seen_eol:
                    # Trailing newlines go back into the buffer, so they
                    # are handled by the next flush instead of ending up
                    # inside this element
                    text = text.rstrip('\n')
                    self._data.append('\n' * self._seen_eol)
                    self._seen_eol = 0
                lines = text.split('\n')

                for line in lines[:-1]:
                    assert self._last.text is None, "internal error (text)"
                    assert self._last.tail is None, "internal error (tail)"
                    if line and not line.isspace():
                        # Finish the current element with this line and
                        # continue in a new sibling with the same tag and
                        # a copy of the attributes
                        self._last.text = line
                        self._last.tail = '\n'
                        attrib = self._last.attrib.copy()
                        self._last = Element(self._last.tag, attrib)
                        self._stack[-2].append(self._last)
                        self._stack[-1] = self._last
                    else:
                        # Blank line: move it in front of the element
                        self._append_to_previous(line + '\n')

                assert self._last.text is None, "internal error (text)"
                self._last.text = lines[-1]
            else:
                # TODO split paragraphs
                if self._tail:
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
        else:
            self._data = []

    def close(self):
        '''Finish building.

        @returns: the root element ('zim-tree')
        '''
        assert len(self._stack) == 0, 'missing end tags'
        assert self._last is not None and self._last.tag == 'zim-tree', \
            'missing root element'
        return self._last

    def _append_to_previous(self, text):
        '''Add text before current element'''
        parent = self._stack[-2]
        # All children of the parent except the current (last) element
        children = list(parent)[:-1]
        if children:
            if children[-1].tail:
                children[-1].tail = children[-1].tail + text
            else:
                children[-1].tail = text
        else:
            if parent.text:
                parent.text = parent.text + text
            else:
                parent.text = text