def _table2divgroups(elem, table_elem, specmap, omit_whitespace=True):
    '''
    Rebuild selected cell ranges of a table as grouped DIVs.

    For every spec in specmap a group DIV is created (via htmlelem) with
    class "mwu-elem-table2divgroups-group" (plus spec.classname, when it
    is not None) and id spec.idname.  The cells found by cell_lookup() at
    positions (row, col), for rows spec.rowstart..spec.rowend and columns
    spec.colstart..spec.colend (both ends inclusive), are deep-copied,
    stripped of all attributes, retagged as DIVs and placed in the group.
    When a spec spans more than one row AND more than one column, the
    cells of each row are additionally wrapped in their own DIV.

    Groups for which elementempty() is true are dropped.  For the others,
    spec.lastfilter (if truthy) is called with the group element.  All
    kept groups are collected under a DIV with class
    "mwu-elem-table2divgroups", which is handed to
    replace_child(elem, table_elem, ...).

    @param elem            : element passed to replace_child() as the context of the swap
    @param table_elem      : the TABLE element to convert; its tag must be "table"
    @param specmap         : iterable of specs exposing idname, classname, rowstart,
                             rowend, colstart, colend and lastfilter
    @param omit_whitespace : if true, cells for which elementempty() is true are skipped

    Raises AssertionError if table_elem is not a table, or if a spec has
    rowend < rowstart or colend < colstart.
    '''
    import copy

    assert table_elem.tag == 'table', table_elem.tag
    cells = cell_lookup(table_elem)
    groups = []
    for spec in specmap:
        classvalue = 'mwu-elem-table2divgroups-group'
        if spec.classname is not None:
            classvalue += ' ' + spec.classname
        group_elem = htmlelem(attrib={
            'class': classvalue,
            'id': spec.idname,
        })
        assert spec.rowend >= spec.rowstart
        assert spec.colend >= spec.colstart
        # Whether to wrap cells from the same TR tag in their own DIV.
        # Only needed when the spec is a true 2-D block of cells.
        wrap_rows = (spec.colend > spec.colstart) and (spec.rowend > spec.rowstart)
        for rownum in range(spec.rowstart, spec.rowend + 1):
            cell_elems = []
            for colnum in range(spec.colstart, spec.colend + 1):
                td_elem = cells.get((rownum, colnum), None)
                if td_elem is None:
                    # Could be a colspan issue. Just skip over to next found cell.
                    continue
                if omit_whitespace and elementempty(td_elem):
                    # Skip over this empty cell.
                    continue
                # Work on a copy so the source table is left untouched
                # until the final replace_child() call.
                cell_elem = copy.deepcopy(td_elem)
                # Drop every attribute in one step instead of deleting
                # keys from the mapping while iterating over it.
                cell_elem.attrib.clear()
                cell_elem.tag = 'div'
                cell_elems.append(cell_elem)
            if wrap_rows:
                append_elem = htmlelem()
                group_elem.append(append_elem)
            else:
                # Append the cells directly to the group, without a
                # per-row wrapper DIV.
                append_elem = group_elem
            for cell_elem in cell_elems:
                append_elem.append(cell_elem)
        if not elementempty(group_elem):
            if spec.lastfilter:
                spec.lastfilter(group_elem)
            groups.append(group_elem)
    # Defensive guard kept from the original; table_elem has already been
    # dereferenced by the assert above.
    if table_elem is not None:
        groups_elem = htmlelem(attrib={'class': 'mwu-elem-table2divgroups'})
        for group_elem in groups:
            groups_elem.append(group_elem)
        replace_child(elem, table_elem, groups_elem)
def _table2divs(elem, omit_whitespace, marker_base, wrap_rows):
    '''
    helper for some table-to-div filters

    Locates a table (elem itself when its tag is "table", otherwise the
    first match of elem.find('.//table')) and builds a container DIV with
    class marker_base holding one DIV per TD cell.  The TR rows are taken
    from the element returned by rowsparent(table_elem); only direct
    "tr" children of that element and direct "td" children of each row
    are considered.  If no table is found, nothing is done.

    Each cell DIV receives the TD's leading text and its child elements,
    and a class attribute made of, in order:
      - rcmarkerbase(marker_base, row=R, col=C)
      - rcmarkerbase(marker_base, col=C)
      - rcmarkerbase(marker_base, row=R)   (only when wrap_rows is false)
      - "mwu-td-" + name, for every class name on the original TD
    where R and C are the enumerate() positions of the row and cell.

    When wrap_rows is true, the cells of each row are placed inside a
    per-row DIV with class rcmarkerbase(marker_base, row=R), which is
    appended to the container even if it ends up with no cells.

    Finally replace_child(elem, table_elem, container) is called.

    @param elem            : the table element, or an element containing one
    @param omit_whitespace : if true, cells for which elementempty() is true are skipped
    @param marker_base     : class of the container, and first argument to rcmarkerbase()
    @param wrap_rows       : if true, group each row's cells in their own DIV
    '''
    def rcmarker(**kw):
        return rcmarkerbase(marker_base, **kw)

    if elem.tag == 'table':
        table_elem = elem
    else:
        table_elem = elem.find('.//table')
    if table_elem is None:
        return

    container_elem = htmlelem(attrib={'class': marker_base})
    root_elem = rowsparent(table_elem)
    for rownum, row in enumerate(root_elem.findall('./tr')):
        if wrap_rows:
            row_target = htmlelem(attrib={'class': rcmarker(row=rownum)})
        else:
            row_target = container_elem
        for colnum, tdelem in enumerate(row.findall('./td')):
            if omit_whitespace and elementempty(tdelem):
                # Skip over this empty cell.
                continue
            cell_elem = htmlelem(text=tdelem.text)
            # Iterate over a snapshot: with lxml, append() moves the child
            # out of tdelem, so the live sequence changes as we go.
            for colchild in list(tdelem):
                cell_elem.append(colchild)
            markers = [
                rcmarker(row=rownum, col=colnum),
                rcmarker(col=colnum),
            ]
            if not wrap_rows:
                # Without a row wrapper, the cell itself carries the row marker.
                markers.append(rcmarker(row=rownum))
            # Carry over the TD's own classes under an "mwu-td-" prefix;
            # split() yields nothing for a missing or blank class value.
            markers.extend(
                'mwu-td-' + c for c in tdelem.attrib.get('class', '').split())
            cell_elem.attrib['class'] = ' '.join(markers)
            row_target.append(cell_elem)
        if wrap_rows:
            container_elem.append(row_target)
    replace_child(elem, table_elem, container_elem)