def parse_book(self, base_dir):
    """Build the 'book' tree from the indexed parts found under base_dir.

    The index is dumped to ``base_dir / "parsed_index.tsv"`` and validated
    with ``self._check_index`` before any part is parsed.

    base_dir: directory holding the ``part/NNNN`` files (one per index row)

    Returns the root ``book`` leaf, with one section per index row.
    """
    # NOTE(review): the original discarded this return value and then read an
    # unbound `index`; assumes parse_index returns the index rows — confirm.
    index = self.parse_index(base_dir)
    (base_dir / "parsed_index.tsv").save(index)
    self._check_index(index)

    o_book = oaktree.Leaf('book')
    prev_node = o_book
    for num, title, ident in index:
        # the nesting depth is the length of the numbering of the row
        depth = len(num)

        txt = (base_dir / "part" / f'{ident:04d}').read_text()
        o_section = self.parse_section(self.expand_shortcut(txt))

        # a title of '~' stands for an empty title
        o_title = oaktree.Leaf('title').add_text(title if title != '~' else '')

        # re-attach the children so that the title comes first
        sub_lst = o_section.sub
        o_section.sub = []
        o_section.attach(o_title, *sub_lst)

        # the parent is picked in the ancestors of the previous section
        prev_node.ancestor_lst[depth - 1].attach(o_section)
        prev_node = o_section

    return o_book
def parse_alinea(self, txt, tag='alinea'):
    """Parse a single line of text into a leaf named after tag.

    txt: should not contain any line feed
    tag: can be something else than 'alinea'. most of the time it will be 'li'

    Returns the new leaf, holding text runs interleaved with parsed atoms.
    """
    o_alinea = oaktree.Leaf(tag)

    # when an alinea ident is found, record it and drop it from the text
    ident_res = alinea_ident_rec.search(txt)
    if ident_res is not None:
        o_alinea.ident = int(ident_res.group('ident'))
        txt = txt[:ident_res.start()]
    txt = txt.rstrip()

    # the first level atoms have already been parsed: walk over their
    # placeholders, emitting the text found in between
    cursor = 0
    for atom_res in atom_line_rec.finditer(txt):
        between = txt[cursor:atom_res.start()]
        if between.strip():
            o_alinea.add_text(between)
        o_alinea.attach(self.parse_atom(atom_res, False))
        cursor = atom_res.end()

    # whatever remains after the last atom
    tail = txt[cursor:]
    if tail.strip():
        o_alinea.add_text(tail)

    return o_alinea
def _get_section(self, *pos, **nam):
    """Return section ``s`` of book ``b``, as produced by ``self.proxy``.

    nam: request parameters; 'b' is the book key and 's' the section number

    Raises cherrypy.HTTPError(400) when 'b' or 's' is missing or when 's' is
    not an integer, and cherrypy.HTTPError(404) when the book is unknown.
    """
    if 'b' not in nam or 's' not in nam:
        raise cherrypy.HTTPError(400)
    if nam['b'] not in self.shelf.book_set:
        raise cherrypy.HTTPError(404)

    # a malformed section number is a client error, not a server failure
    try:
        section_n = int(nam['s'])
    except (TypeError, ValueError):
        raise cherrypy.HTTPError(400) from None

    return self.proxy._get_section(nam['b'], section_n)
def _prep_section(self, book_key, ident):
    """Parse and compose one part of a book, caching the resulting HTML.

    book_key: name of the book directory under self.repo_dir
    ident: integer number of the part, read from ``part/NNNN``

    The parsed tree is dumped to ``.tmp/NNNN.parsed.bkt`` and the HTML
    fragment is saved to ``.cache/part/NNNN``.

    Returns what ``Html5Proxy.save`` yields when called without a path.
    """
    base_dir = self.repo_dir / book_key
    part_name = f'{ident:04d}'

    parser = marccup.parser.generic.GenericParser()
    composer = marccup.composer.html5.Html5Composer()

    o_parsed = parser.parse((base_dir / 'part' / part_name).read_text())

    # keep a dump of the parsed tree
    braket_proxy = oaktree.proxy.braket.BraketProxy()
    braket_proxy.save(o_parsed, Path(base_dir / ".tmp" / f'{part_name}.parsed.bkt'))

    # compose under a throwaway container, the result is its first child
    o_container = oaktree.Leaf('tmp')
    composer.compose(o_parsed, o_container)
    o_result = o_container.sub[0]

    html_proxy = oaktree.proxy.html5.Html5Proxy(indent='', fragment=True)
    html_proxy.save(o_result, Path(base_dir / ".cache" / "part" / part_name))

    return html_proxy.save(o_result)
def parse_paragraph(self, paragraph_txt):
    """Parse one paragraph and return the matching leaf.

    paragraph_txt consists in, either :
    * a single atom, handed over to parse_atom
    * a bullet list, handed over to parse_list
    * some alineas, gathered in a 'paragraph' leaf
    """
    # the paragraph consists in a sole atom, with possibly an ident
    atom_block_res = atom_block_rec.match(paragraph_txt)
    if atom_block_res is not None:
        return self.parse_atom(atom_block_res, True)

    alinea_lst = paragraph_txt.splitlines()

    # only bullets (or numbered items) on every line: this is a list
    if all(bullet_list_rec.match(line) for line in alinea_lst):
        return self.parse_list(paragraph_txt)

    # at least one normal alinea inside
    o_block = oaktree.Leaf('paragraph')

    # if the last line is a paragraph ident, parse it, and drop it
    ident_res = paragraph_ident_rec.match(alinea_lst[-1])
    if ident_res is not None:
        o_block.ident = int(ident_res.group('ident'))
        del alinea_lst[-1]

    for alinea_txt in alinea_lst:
        o_block.attach(self.parse_alinea(alinea_txt))

    return o_block
def parse_list(self, txt):
    """Parse a block of bullet lines into nested 'ul' / 'ol' leaves.

    txt: lines which all match bullet_list_rec; the length of the 'tabs'
         group gives the nesting level, the 'marker' group picks the list
         type ('*' for 'ul', '#' for 'ol')

    Returns the outermost list leaf, or None when txt holds no line.
    Raises ValueError on an unknown marker.
    """
    list_tag_map = {'*': 'ul', '#': 'ol'}

    o_root = None
    prev_indent = -1

    for line in txt.splitlines():
        res = bullet_list_rec.search(line)
        indent = len(res.group('tabs'))

        if indent < prev_indent:
            # reduced indentation level: each level is a list plus its item
            o_list = o_list.parent_n(2 * (prev_indent - indent))
        elif indent > prev_indent:
            # deeper level, a new ol/ul group is created; this also catches
            # up when the very first item is itself indented (which is not
            # very standard)
            marker = res.group('marker')
            if marker not in list_tag_map:
                raise ValueError()
            o_list = oaktree.Leaf(list_tag_map[marker])
            if o_root is None:
                o_root = o_list
            else:
                # nest the new group under the previous item
                o_alinea.attach(o_list)
        # same level otherwise: the current list is reused as is

        o_alinea = self.parse_alinea(res.group('line'), 'li')
        o_list.attach(o_alinea)
        prev_indent = indent

    return o_root
def parse_document(self, root_dir):
    """Build the 'doc' tree described by ``root_dir / "__doc__.tsv"``.

    Each row of the table gives the nesting level of a section (1 for a
    direct child of the document) and its number; the text of the section
    is read from ``root_dir / "<number>.bkt"``.

    Returns the root 'doc' leaf.
    Raises ValueError when a level is below 1 or has no parent section.
    """
    o_doc = oaktree.Leaf("doc")

    # chapter_lst[d] is the latest section seen at level d, the document
    # itself being at level 0; it grows as deeper levels are met
    chapter_lst = [o_doc]

    for indent, section in (root_dir / "__doc__.tsv").load():
        indent, section = int(indent), int(section)

        if indent < 1:
            raise ValueError(f"invalid level {indent} for section {section}")
        if len(chapter_lst) < indent:
            raise ValueError(f"section {section} at level {indent} has no parent")

        o_section = chapter_lst[indent - 1].grow('section')
        if indent == len(chapter_lst):
            chapter_lst.append(o_section)
        else:
            chapter_lst[indent] = o_section

        txt = (root_dir / f"{section}.bkt").read_text()
        self.parse_section(o_section, txt)

    return o_doc
def parse_atom(self, atom_res, is_block):
    """Turn a protected atom placeholder into its leaf.

    atom_res: match object whose 'atom_n' group indexes self.atom_map; when
              is_block is set, its 'ident' group is read as well
    is_block: True when the atom stands alone as a paragraph, False when it
              is found inside an alinea

    Returns the leaf built for the atom.
    """
    atom = self.atom_map[int(atom_res.group('atom_n'))]

    if atom.tag == "table":
        o_block = self._parse_atom_table(atom.content)
    elif atom.tag == "math":
        if is_block:
            o_block = oaktree.Leaf('math', flag={'block'})
        else:
            o_block = oaktree.Leaf('math')
        o_block.add_text(atom.content[0].strip())
    else:
        o_block = self.parse_alinea('|'.join(atom.content), atom.tag)
        # only flag real blocks, as the math branch does; inline atoms used
        # to be flagged 'block' as well
        if is_block:
            o_block.flag.add('block')

    # the 'ident' group is only looked up for block atoms
    if is_block and atom_res.group('ident') is not None:
        o_block.ident = int(atom_res.group('ident').strip())

    return o_block
def _parse_atom_table(self, txt):
    """Parse the content of a table atom into a 'table' leaf.

    txt: the pieces of the atom content; they are joined back with '|'

    Rows are split on table_split_rec when it is found in the text, else
    every non-blank line is a row. Cells are split on '|'; a cell may start
    with a span clue (table_span_rec) and then with '=' for a header.

    Returns the 'table' leaf, made of 'table_row' / 'table_cell' leaves.
    """
    o_table = oaktree.Leaf('table')

    # a bit of cleaning, such as the row separator is really the sole marker on its line
    txt = '\n'.join(
        (line.strip() if table_split_rec.match(line) is not None else line)
        for line in '|'.join(txt).splitlines())

    txt = self.protect_atom(txt)

    if table_split_rec.search(txt) is not None:
        row_lst = table_split_rec.split(txt)
    else:
        row_lst = [line for line in txt.splitlines() if line.strip()]

    for row in row_lst:
        o_row = o_table.grow('table_row')
        for cell in row.split('|'):
            cell = cell.strip()

            # look for row or col span clues; they are kept aside until the
            # cell exists (they used to be written on the previous cell)
            span_map = {}
            table_span_res = table_span_rec.match(cell)
            if table_span_res is not None:
                row_n = table_span_res.group('row_n')
                if row_n is not None:
                    span_map["rspan"] = row_n
                col_n = table_span_res.group('col_n')
                if col_n is not None:
                    span_map["cspan"] = col_n
                cell = cell[table_span_res.end():]

            # look for header clue
            is_header = cell.startswith('=')
            if is_header:
                cell = cell[1:].lstrip()

            o_cell = o_row.grow('table_cell')
            for key, value in span_map.items():
                o_cell.nam[key] = value

            o_content = self.parse(cell, True)
            o_cell.attach(*o_content.sub)

            if is_header:
                o_cell.flag.add('header')

    return o_table
def _parse_section(self, txt, tag='section'):
    """Parse a section, i.e. a part of text which contains many paragraphs.

    txt: raw text of the section
    tag: name of the leaf which receives the parsed paragraphs

    Returns the new leaf, with one child per paragraph.
    """
    source_txt = txt
    o_section = oaktree.Leaf(tag)

    # protect higher level atoms, then cleanup
    cleaned_txt = self.clean_lines(self.protect_atom(txt))

    # paragraphs are separated by an empty line
    for paragraph_txt in cleaned_txt.split('\n\n'):
        o_section.attach(self.parse_paragraph(paragraph_txt))

    # hand the source and the parsed tree over to the debug hook
    self.dbg(
        'GenericParser.parse_section.bkt',
        source_txt,
        BraketProxy().save(o_section.root),
    )

    return o_section
def mcp_to_html(self, mcp_txt, debug_dir=None):
    """Convert a marccup text into an html fragment.

    mcp_txt: the marccup source of a section
    debug_dir: optional directory, handed to the parser; when set, the
               parsed tree is saved there as '4_parsed.bkt' and the html
               (one tag per line) as '5_composed.html'

    Returns the html text produced by Html5Proxy.save.
    """
    parser = marccup.MarccupParser(debug_dir)
    o_section = parser.parse_section(mcp_txt)

    # compose under a throwaway container, the result is its first child
    o_container = oaktree.Leaf('tmp')
    composer = spext.composer.html5.Html5Composer__base__()
    composer.compose(o_section, o_container)

    html_proxy = oaktree.proxy.html5.Html5Proxy(indent='', fragment=True)
    html_txt = html_proxy.save(o_container.sub[0])

    if not debug_dir:
        return html_txt

    braket_proxy = oaktree.proxy.braket.BraketProxy()
    braket_proxy.save(o_section, debug_dir / '4_parsed.bkt')
    composed_pth = debug_dir / "5_composed.html"
    composed_pth.write_text(html_txt.replace('><', '>\n<'))

    return html_txt
def parse_page(self, txt):
    """Split a page on its title lines and parse each part as a section.

    txt: the whole text of the page; the lines matching title_rec open a new
         section, whose depth is the length of the 'depth' group

    The lines found before the first title are parsed as an introductory
    section, attached to the page. Each titled section is attached to the
    ancestor of the previous section found at its depth.

    Returns the 'page' leaf.
    Raises ValueError when a title is more than one level deeper than the
    previous one.
    """
    o_page = oaktree.Leaf('page')

    prev_depth = 0
    prev_node = o_page
    prev_res = None
    stack = []

    def flush():
        # parse the pending lines and attach the resulting section
        nonlocal prev_depth, prev_node
        content = '\n'.join(stack)
        if prev_res is None:
            # no title previously found, this must be some introductory text
            prev_node.attach(self.parse_section(content))
            return
        o_section = self.parse_section(content, prev_res)
        depth = len(prev_res.group('depth'))
        if prev_depth + 1 < depth:
            raise ValueError(
                f"title of depth {depth} found after a title of depth {prev_depth}")
        prev_node.ancestor_lst[depth - 1].attach(o_section)
        prev_depth = depth
        prev_node = o_section

    for line in txt.splitlines():
        title_res = title_rec.match(line)
        if title_res is None:
            # push the line on the stack
            stack.append(line)
            continue
        flush()
        prev_res = title_res
        # the title line is the first line of its own section
        stack = [line]

    # the lines following the last title (or the whole page when it has no
    # title) used to be dropped
    if stack:
        flush()

    return o_page
#!/usr/bin/env python3
"""Parse the section named on the command line and compose it.

The source is read from document/<name>.bkt; the parser is given tmp as its
debug directory and the composer is given tmp/result.html.
"""

import sys

from cc_pathlib import Path

import oaktree
import marccup

# the sole argument is the name of the section, without extension
source_pth = Path("document") / f"{sys.argv[1]}.bkt"
source_txt = source_pth.read_text()

# the parser fills a section leaf created beforehand
parser = marccup.Parser(debug_dir=Path("tmp"))
o_section = oaktree.Leaf("section")
parser.parse_section(o_section, source_txt)

composer = marccup.Composer(o_section, Path("tmp/result.html"))
n = '\n' if self.indent else '' w(i + '<' + ' '.join(s) + ('>' if node.sub else ' />') + n) for k in node.sub: if isinstance(k, oaktree.Leaf): self.compose(k, w, depth + 1) else: w(str(k) + n) if node.sub: w(f'{i}</{t}>{n}') if __name__ == '__main__': import oaktree u = oaktree.Leaf('tutu', nam={ 'vache': "meuh", 'canard': "coincoin" }, style="animal") g = u.grow('toto', flag="eurhm") g.add_text("bizarre") g.grow('tata') g.add_text("vouzavé dit bizarre") x = Html5Proxy() print(x.save(u))
#!/usr/bin/env python3
"""Build a tiny tree and save it to test.bkt through BraketProxy."""

from cc_pathlib import Path

import oaktree
from oaktree.proxy.braket import BraketProxy

# a root with two children, the second one holding two pieces of text
o_root = oaktree.Leaf("one")
o_root.grow("one-one")
o_second = o_root.grow("one-two")
for text in ("first line", "second line"):
    o_second.add_text(text)

BraketProxy().save(o_root, Path("test.bkt"))