def getcss(doc):
    """Collect the CSS rules that apply to ``doc`` into a single dictionary.

    Three sources are read, in this order:

    * ``link`` elements whose ``rel`` attribute equals ``'stylesheet'``
      (the ``href`` is fetched with ``urllib.urlopen`` when it contains
      ``'http:'``, otherwise it is opened as a local path),
    * the text and comment children of ``style`` elements,
    * inline ``style`` attributes anywhere in the document.

    Arguments:
    doc -- document (or node) handed to ``tagsinList`` and ``tagsbyAttr``

    Returns a dict mapping each name yielded by ``cssplit1`` to a dict of
    the ``(cmd, value)`` pairs yielded by ``cssplit2``.  Inline styles are
    stored under generated keys ``'.ecss1'``, ``'.ecss2'``, ... and only
    when an equal rule dict is not already present.
    """
    import urllib

    def fullsplit(csstring):
        # Parse a whole stylesheet string and merge its rules into cssdict;
        # a later rule with the same name replaces an earlier one.
        for classname, cmds in cssplit1(csstring):
            subcss = dict()
            for cmd, value in cssplit2(cmds):
                subcss[cmd] = value
            cssdict[classname] = subcss

    def readsheet(url):
        # Return the text behind a stylesheet reference and always release
        # the underlying handle (the handle used to be left open).
        if url.find('http:') != -1:
            source = urllib.urlopen(url)
        else:
            source = open(urllib.url2pathname(url))
        try:
            return source.read()
        finally:
            source.close()

    cssdict, count = dict(), 1
    for tag in tagsinList(doc, ['style', 'link']):
        if isname(tag, 'link'):
            if attget(tag, 'rel') == 'stylesheet':
                fullsplit(readsheet(attget(tag, 'href')))
        elif isname(tag, 'style'):
            for i in tag.childNodes:
                if istext(i) or iscomment(i):
                    fullsplit(i.data)
    for tag in tagsbyAttr(doc, 'style'):
        subcss = dict()
        for cmd, value in cssplit2(attget(tag, 'style')):
            subcss[cmd] = value
        # Only register inline rules that are not already known.
        if subcss not in cssdict.values():
            cssdict[''.join(['.ecss', str(count)])] = subcss
            count += 1
    return cssdict
def _getByName(self, childList, name):
    '''Return the first node whose u'name' attribute equals name.

    Arguments:
    childList -- iterable of nodes to search, in order
    name -- value compared against attget(node, u'name')

    Returns None when no node in childList matches.'''
    matching = (child for child in childList
                if attget(child, u'name') == name)
    return next(matching, None)
def fixspans(doc):
    """Adjust ``rowspan``/``colspan`` attributes of the tables in ``doc``.

    For every element returned by ``tags(doc, htmlns, 'table')`` the
    function clamps oversized spans to the table's row/column counts,
    builds a per-row grid of ``td`` nodes, and then rewrites or removes
    ``colspan`` values based on that grid.  The document is modified in
    place; nothing is returned.

    Arguments:
    doc -- document (or node) searched for table elements

    NOTE(review): the nesting below was reconstructed from a copy of the
    source whose indentation had been lost -- confirm it against the
    original file before relying on it.
    """
    tables = list(tags(doc, htmlns, 'table'))
    for table in tables:
        # Count the direct 'tr' children and the widest row (in 'td' cells).
        trcount, tdcount = 0, 0
        for tr in table.childNodes:
            if tr.localName == 'tr':
                trcount += 1
        for tr in table.childNodes:
            count = 0
            for td in tr.childNodes:
                if td.localName == 'td':
                    count += 1
            if count > tdcount:
                tdcount = count
        # Clamp spans that exceed the counts measured above.
        for tr in table.childNodes:
            for td in tr.childNodes:
                if iselement(td):
                    if hasatt(td, 'rowspan'):
                        if int(attget(td, 'rowspan')) > trcount:
                            attset(td, 'rowspan', str(trcount))
                    # As an elif, a cell carrying both attributes only has
                    # its rowspan examined here.
                    elif hasatt(td, 'colspan'):
                        if int(attget(td, 'colspan')) > tdcount:
                            attset(td, 'colspan', str(tdcount))
        # grid: one list of td nodes per tr element, in document order.
        grid, temp = [], []
        for tr in table.childNodes:
            if iselement(tr) and isname(tr, 'tr'):
                if len(temp):
                    grid.append(temp)
                temp = []
                for td in tr.childNodes:
                    if iselement(td) and isname(td, 'td'):
                        temp.append(td)
        if len(temp):
            grid.append(temp)
        rowspans = list(tagsbyAttr(table, 'rowspan'))
        if len(rowspans):
            for td in rowspans:
                # sibs is joined into an expression such as
                # 'td.parentNode.nextSibling.nextSibling' and eval'ed to
                # reach the siblings that follow the cell's parent.
                # NOTE(review): once one step evaluates to None, the next
                # iteration dereferences None -- confirm this cannot happen.
                trs, sibs = [], ['td.parentNode']
                for x in range(int(attget(td, 'rowspan'))-1):
                    sibs.append('.nextSibling')
                    if eval(''.join(sibs)):
                        trs.append(eval(''.join(sibs)))
                # Repeat the spanning cell in the grid rows it covers.
                for nodelist in grid:
                    # loc is only (re)assigned for a row containing td and
                    # is carried over to the rows that follow.
                    # NOTE(review): loc is unbound if a covered row is
                    # reached before any row containing td -- verify.
                    for i in nodelist:
                        if td in nodelist:
                            loc = nodelist.index(td)
                    for tr in trs:
                        if tr.firstChild in nodelist:
                            nodelist.insert(loc, td)
        # Reconcile each colspan with what the other grid rows hold.
        for nodelist in grid:
            for td in nodelist:
                if hasatt(td, 'colspan'):
                    index = nodelist.index(td)
                    colspan = int(attget(td, 'colspan'))
                    # offset equals index; with index 0 the slice
                    # nlist[index:-offset] is nlist[0:0], i.e. empty.
                    offset, largest = len(nodelist[:index]), 0
                    for nlist in grid:
                        actual = len(nlist[index:-offset])
                        if colspan > actual:
                            if actual > largest:
                                # Sum of the colspans found in that slice.
                                count = 0
                                for i in nlist[index:-offset]:
                                    if hasatt(i, 'colspan'):
                                        count += int(attget(i, 'colspan'))
                                if count != colspan:
                                    largest = actual
                    if largest:
                        attset(td, 'colspan', str(largest))
                    # A span of one (or less) is dropped altogether.
                    if int(attget(td, 'colspan')) <= 1:
                        attdel(td, 'colspan')
def htmlgrid(doc, structure):
    """Group the nodes of ``structure`` into a list of lists (a grid).

    ``tr`` and ``table`` nodes each become a one-element list; ``td``
    nodes are gathered, together with their following ``td`` siblings that
    also occur in ``structure``, into a shared list.  Cells that carry a
    ``rowspan`` attribute are afterwards appended to the lists of the rows
    they extend into.

    Arguments:
    doc -- document searched for elements with a ``rowspan`` attribute
    structure -- collection of nodes to arrange (iterated and tested with
                 ``in``)

    Returns the grid (list of lists of nodes).

    NOTE(review): the nesting below was reconstructed from a copy of the
    source whose indentation had been lost -- confirm it against the
    original file before relying on it.
    """
    # temp: td nodes collected for the current row; blist: td nodes
    # already placed, so a cell is never collected twice.
    grid, temp, blist = [], [], []
    for node in structure:
        if isname(node, 'tr'):
            if len(temp):
                # Pending cells are flushed before or after the new row
                # entry depending on whether their parent is already the
                # last grid entry.
                # NOTE(review): grid[-1] raises IndexError when cells were
                # collected before any tr/table was appended -- verify.
                if temp[0].parentNode in grid[-1]:
                    if temp not in grid:
                        grid.append(temp)
                    grid.append([node])
                else:
                    grid.append([node])
                    if temp not in grid:
                        grid.append(temp)
            else:
                grid.append([node])
            temp = []
        elif isname(node, 'td'):
            if node not in blist:
                temp.append(node)
                blist.append(node)
                # sibs is joined into 'node.nextSibling.nextSibling...' and
                # eval'ed to visit the siblings that follow this cell.
                sibs = ['node.nextSibling']
                for x in range(len(node.parentNode.childNodes)-1):
                    neval = ''.join(sibs)
                    if eval(neval):
                        if eval('.'.join([neval, 'localName'])) == 'td':
                            if eval(neval) not in blist:
                                if eval(neval) in structure:
                                    temp.append(eval(neval))
                                    blist.append(eval(neval))
                        # Only advance while a sibling exists, so the chain
                        # never dereferences None.
                        sibs.append('.nextSibling')
        elif isname(node, 'table'):
            if len(temp):
                if temp not in grid:
                    grid.append(temp)
            grid.append([node])
    if len(temp):
        grid.append(temp)
    rowspans = list(tagsbyAttr(doc, 'rowspan'))
    if len(rowspans):
        for td in rowspans:
            # Same eval technique, starting from the cell's parent, to
            # collect the siblings covered by the span.
            # NOTE(review): once one step evaluates to None, the next
            # iteration dereferences None -- confirm this cannot happen.
            trs, sibs = [], ['td.parentNode']
            for x in range(int(attget(td, 'rowspan'))-1):
                sibs.append('.nextSibling')
                if eval(''.join(sibs)):
                    trs.append(eval(''.join(sibs)))
            # Append the spanning cell to every grid list that contains the
            # first child of one of those siblings.
            for nodelist in grid:
                for tr in trs:
                    if tr.firstChild in nodelist:
                        nodelist.append(td)
    return grid
def cssifier(doc, **kwargs):
    """Replace presentational HTML attributes in ``doc`` with CSS.

    Every element returned by ``tagsbyNS(htmlns, doc)`` whose ``localName``
    is a key of ``htmldefs.html2css`` has its mapped attributes removed and
    turned into CSS declarations, merged with the rules of its ``class``
    and its inline ``style``.  The result is written back either as an
    inline ``style`` attribute (default) or as a generated ``class``.

    Keyword arguments:
    embed -- if this key is present (any value), elements get a ``class``
             and the collected rules are appended to the first ``head``
             element inside a ``style`` element (wrapped in a comment)
    file -- path of a stylesheet to write; elements get a ``class`` and a
            ``link`` element pointing at the file is appended to the first
            ``head`` element.  Ignored when ``embed`` is given.

    In both keyword modes the existing ``style`` elements and
    ``link rel="stylesheet"`` elements are removed from the document.
    The document is modified in place; nothing is returned.

    NOTE(review): the nesting below was reconstructed from a copy of the
    source whose indentation had been lost -- confirm it against the
    original file before relying on it.
    """
    from htmldefs import html2css

    def pixels(value):
        # Values that parse as a non-zero integer get a 'px' unit; anything
        # else (including '0' and non-numeric text) is passed through.
        try:
            if int(value):
                value = ''.join([value, 'px'])
        except ValueError:
            pass
        return value

    def discard(key):
        # Schedule an attribute for removal exactly once; a tuple mapping
        # yields several declarations for one attribute, and deleting the
        # same key twice would fail.
        if key not in delete:
            delete.append(key)

    def insertcss(attdict):
        # Yield one CSS declaration per mapped attribute of the current tag.
        # Keys are indexed with [1] -- presumably (namespace, localname)
        # pairs; verify against the DOM implementation in use.
        for key in attdict:
            if key[1] not in attmap:
                continue
            mapping = attmap.get(key[1])
            if not mapping:
                continue
            if isinstance(mapping, tuple):
                if len(mapping) > 1:
                    for item in mapping:
                        if item.find('%s') != -1:
                            # Was read through an undefined name 'attlist';
                            # every sibling branch uses tag.attributes.
                            value = pixels(tag.attributes.get(key).value)
                            discard(key)
                            yield item % value
                        else:
                            discard(key)
                            yield item
                elif len(mapping) == 1:
                    # NOTE(review): 'mapping % value' applies % to a tuple
                    # and the stored rule is the integer 0; the intent is
                    # unclear, so this branch is left exactly as it was.
                    if mapping not in css:
                        value = tag.attributes.get(key).value
                        css[mapping % value] = 0
                        discard(key)
            elif isinstance(mapping, dict):
                value = mapping.get(tag.attributes.get(key).value)
                if value:
                    discard(key)
                    yield value
            elif mapping.find('%s') != -1:
                value = pixels(tag.attributes.get(key).value)
                discard(key)
                yield mapping % value

    def recss(cdict):
        # Serialise a rule dict as 'prop: value; prop: value'.
        return '; '.join(['%s: %s' % (i, j) for i, j in cdict.iteritems()])

    embed = 'embed' in kwargs
    path = kwargs.get('file')
    # The element list must not be called 'tags': that name is the helper
    # used further down to locate the head element.
    css, elements, count = getcss(doc), list(tagsbyNS(htmlns, doc)), 1
    for tag in elements:
        if tag.localName not in html2css:
            continue
        delete, attmap = list(), html2css.get(tag.localName)
        tcss, scss = list(insertcss(tag.attributes)), dict()
        for key in delete:
            del tag.attributes[key]
        if tcss:
            for cmd, value in cssplit2('; '.join(tcss)):
                scss[cmd] = value
        if hasatt(tag, 'class'):
            # An unknown class contributes no rules instead of failing.
            scss.update(css.get(''.join(['.', attget(tag, 'class')])) or {})
            attdel(tag, 'class')
        if hasatt(tag, 'style'):
            # Parse the inline declarations the same way getcss does.
            for cmd, value in cssplit2(attget(tag, 'style')):
                scss[cmd] = value
            attdel(tag, 'style')
        if scss not in css.values():
            classvalue = ''.join(['css', str(count)])
            css[''.join(['.', classvalue])] = scss
            count += 1
        else:
            # Reuse the name of an existing, identical rule set.
            for selector, rules in css.iteritems():
                if rules == scss:
                    classvalue = selector.strip('.')
        if embed or path:
            attset(tag, 'class', classvalue)
        else:
            attset(tag, 'style', recss(scss))
    if not (embed or path):
        return
    cssgather = [('%s {%s}' % (i, recss(css.get(i)))) for i in css]
    cssgather.sort()
    cssmaster = '\n'.join(cssgather)
    # The generated stylesheet supersedes everything already in the document.
    for node in tagsinList(doc, ['style', 'link']):
        if isname(node, 'link'):
            if hasatt(node, 'rel'):
                if attget(node, 'rel') == 'stylesheet':
                    node.parentNode.removeChild(node)
        elif isname(node, 'style'):
            node.parentNode.removeChild(node)
    head = list(tags(doc, htmlns, 'head'))[0]
    if embed:
        style = doc.createElementNS(htmlns, 'style')
        style.appendChild(doc.createComment(cssmaster))
        head.appendChild(style)
    else:
        import urllib
        output = open(path, 'wb')
        try:
            output.write(cssmaster)
        finally:
            output.close()
        style = doc.createElementNS(htmlns, 'link')
        attset(style, 'rel', 'stylesheet')
        attset(style, 'href', urllib.pathname2url(path))
        attset(style, 'type', 'text/css')
        head.appendChild(style)
def getwidth(node):
    """Return the value of the 'width' attribute of node, as given by attget."""
    width = attget(node, 'width')
    return width
def fixwidths(grid):
    """Assign explicit ``width`` attributes to the nodes of ``grid``.

    The first node of the first row receives the largest recorded row
    total.  In every later row, percentage widths and missing widths are
    replaced by numbers derived from the parent's width.  When a row mixes
    several percentage widths with a fixed width, adjacent percentage
    cells are moved into a newly created nested table.  Nodes are modified
    in place.

    Arguments:
    grid -- list of lists of nodes; ``grid[0][0]`` receives the overall
            width

    Returns None.

    NOTE(review): the nesting below was reconstructed from a copy of the
    source whose indentation had been lost -- confirm it against the
    original file before relying on it.
    """
    def haswidth(node):
        return hasatt(node, 'width')
    def getwidth(node):
        return attget(node, 'width')
    def setwidth(node, width):
        attset(node, 'width', width)
    def regtest(nodelist):
        # True when some node has a width without '%'; otherwise falls off
        # the end and returns None.
        for i in nodelist:
            if haswidth(i):
                if getwidth(i).find('%') == -1:
                    return True
    def element(name):
        # Uses whatever 'node' is currently bound to in the enclosing
        # function to reach the owning document.
        return node.ownerDocument.createElementNS(htmlns, name)
    maxwidths, count, mixed = [], 0, None
    # Sum the non-percentage widths of each row.
    # NOTE(review): count is appended before it is recomputed, so the
    # first entry is the initial 0 and the last row's total is never
    # recorded -- confirm this is intended.
    for nodelist in grid:
        maxwidths.append(count)
        count = 0
        for node in nodelist:
            if haswidth(node):
                width = getwidth(node)
                if width.find('%') == -1:
                    count += int(width)
    if len(maxwidths):
        # Largest recorded total becomes the width of the first node.
        maxwidths.sort()
        maxwidths.reverse()
        maxwidth = str(maxwidths[0])
        setwidth(grid[0][0], maxwidth)
    else:
        return None
    for nodelist in grid[1:]:
        # nowidths: nodes lacking a width; percents: nodes with a '%' width;
        # count: sum of the fixed widths in this row.
        nowidths, percents, count = [], [], 0
        for node in nodelist:
            pwidth = getwidth(node.parentNode)
            if pwidth == '':
                pwidth = maxwidth
            if len(nodelist) > 1:
                if haswidth(node):
                    width = getwidth(node)
                    if width.find('%') == -1:
                        count += int(width)
                    elif width.find('%') != -1:
                        percents.append(node)
                else:
                    nowidths.append(node)
            else:
                # A lone node simply takes its parent's width.
                setwidth(node, pwidth)
        if len(nodelist) > 1:
            # pwidth is the value left over from the last node of the row.
            pwidth = int(pwidth)
            if count:
                pwidth -= count
            if len(nowidths):
                # NOTE(review): '/' on two ints -- floor division under
                # Python 2, true division under Python 3.
                portion = pwidth / len(nowidths)
            if len(percents) > 1 and regtest(nodelist):
                mixed = True
            # Convert percentages into numbers relative to the remaining
            # width.
            for node in percents:
                width = round((float(getwidth(node).strip('%'))*0.01)*pwidth)
                attset(node, 'width', str(int(width)))
            for node in nowidths:
                setwidth(node, str(portion))
            if mixed:
                # Collect 'td' nodes from percents that have another
                # percents member among their following siblings.
                tds = []
                for node in percents:
                    if node not in tds and isname(node, 'td'):
                        # sibs is joined into 'node.nextSibling...' and
                        # eval'ed to reach the following siblings.
                        # NOTE(review): raises once a step yields None --
                        # confirm enough siblings always exist.
                        sibs = ['node']
                        for x in range(len(percents)):
                            sibs.append('.nextSibling')
                            if eval(''.join(sibs)) in percents:
                                if node not in tds:
                                    tds.append(node)
                                tds.append(eval(''.join(sibs)))
                if len(tds):
                    # New wrapper td is inserted where the first collected
                    # cell stood; the cells are moved into a new
                    # tr/table nested inside it.
                    tcount, td = 0, element('td')
                    tds[0].parentNode.insertBefore(td, tds[0])
                    tr, table = element('tr'), element('table')
                    # share: attribute name/value pairs common to all
                    # collected cells, later copied onto the wrapper.
                    share, heights = dict(), list()
                    for t in tds:
                        att, delete = dict(), dict()
                        if len(share):
                            for a in t.attributes:
                                attr = t.attributes.get(a)
                                att[attr.name] = attr.value
                        else:
                            for a in t.attributes:
                                attr = t.attributes.get(a)
                                share[attr.name] = attr.value
                        if len(att):
                            # Drop shared pairs this cell does not have.
                            for x, y in share.iteritems():
                                if (x, y) not in att.iteritems():
                                    delete[x] = y
                            for x in delete:
                                del share[x]
                        tcount += int(getwidth(t))
                        tr.appendChild(t)
                    setwidth(td, str(tcount))
                    if len(share):
                        for n, v in share.iteritems():
                            attset(td, n, v)
                    attset(table, 'border', '0')
                    attset(table, 'cellspacing', '0')
                    attset(table, 'cellpadding', '0')
                    setwidth(table, str(tcount))
                    td.appendChild(table)
                    table.appendChild(tr)
                    nodelist.append(td)
                    # Give every node of the row the same height.
                    for x in nodelist:
                        if hasatt(x, 'height'):
                            h = attget(x, 'height')
                            if h.find('%') == -1:
                                heights.append(h)
                    if len(heights):
                        # NOTE(review): heights holds the values returned
                        # by attget; if those are strings the sort is
                        # lexicographic ('90' > '100') -- verify.
                        heights.sort()
                        heights.reverse()
                        max = str(heights[0])
                        for x in nodelist:
                            attset(x, 'height', max)
                mixed = None