def getcss(doc):
    """Collect the CSS rules referenced by *doc* into one dictionary.

    Three sources are read, in this order:

    1. ``<link>`` tags whose ``rel`` attribute equals ``'stylesheet'``: the
       ``href`` is fetched with ``urllib.urlopen`` when it contains
       ``http:`` or ``https:``, otherwise it is converted with
       ``urllib.url2pathname`` and read as a local file.
    2. ``<style>`` tags: every text or comment child is parsed.
    3. Elements carrying a ``style`` attribute: each distinct set of
       declarations is stored under a generated key ``.ecss1``,
       ``.ecss2``, ...

    Stylesheet text is split with ``cssplit1`` into (name, declarations)
    pairs and each declarations string is split with ``cssplit2`` into
    (command, value) pairs.  A name seen more than once keeps only the
    declarations of its last occurrence.

    This function uses the Python 2 ``urllib`` API (``urlopen`` and
    ``url2pathname`` as module-level functions).

    :param doc: document object passed to ``tagsinList`` / ``tagsbyAttr``.
    :returns: dict mapping each rule name to a dict of command -> value.
    """
    import contextlib
    import urllib

    cssdict, count = dict(), 1

    def fullsplit(csstring):
        # Parse one stylesheet and store its rules in the shared cssdict.
        for classname, cmds in cssplit1(csstring):
            cssdict[classname] = dict(
                (cmd, value) for cmd, value in cssplit2(cmds))

    for tag in tagsinList(doc, ['style', 'link']):
        if isname(tag, 'link'):
            if attget(tag, 'rel') == 'stylesheet':
                url = attget(tag, 'href')
                # NOTE(review): href comes straight from the document; if
                # documents can be untrusted this fetches arbitrary URLs or
                # reads arbitrary local files -- confirm that is acceptable.
                if 'http:' in url or 'https:' in url:
                    # closing() guarantees the connection is released even
                    # if read() raises.
                    with contextlib.closing(urllib.urlopen(url)) as remote:
                        style = remote.read()
                else:
                    with open(urllib.url2pathname(url)) as local:
                        style = local.read()
                fullsplit(style)
        elif isname(tag, 'style'):
            for child in tag.childNodes:
                if istext(child) or iscomment(child):
                    fullsplit(child.data)

    # Inline style="" attributes: identical declaration sets share one entry.
    for tag in tagsbyAttr(doc, 'style'):
        subcss = dict(
            (cmd, value) for cmd, value in cssplit2(attget(tag, 'style')))
        if subcss not in cssdict.values():
            cssdict['.ecss' + str(count)] = subcss
            count += 1
    return cssdict
def fixspans(doc):
    """Adjust ``rowspan`` / ``colspan`` attributes of the tables in *doc*.

    For every table returned by ``tags(doc, htmlns, 'table')`` the function
    works in place, in these steps:

    1. Count the direct ``tr`` children (``trcount``) and the largest number
       of ``td`` children found under any single child (``tdcount``).
    2. Clamp ``rowspan`` values above ``trcount`` and ``colspan`` values
       above ``tdcount``.
    3. Build a grid: one list of ``td`` elements per non-empty ``tr``.
    4. For each element with a ``rowspan``, insert it into the grid rows
       that belong to the following sibling rows it covers, at the column
       index it has in its own row.
    5. For each grid cell with a ``colspan``, possibly shrink the value
       based on the other grid rows, then remove the attribute when the
       value is 1 or less.

    :param doc: document object passed to ``tags``.
    :returns: None; attributes are modified on the document's elements.
    """
    # Materialise the list first: attributes are rewritten while looping.
    tables = list(tags(doc, htmlns, 'table'))
    for table in tables:
        # Step 1: row count and widest row, measured in td elements.
        trcount, tdcount = 0, 0
        for tr in table.childNodes:
            if tr.localName == 'tr':
                trcount += 1
            count = sum(1 for td in tr.childNodes if td.localName == 'td')
            if count > tdcount:
                tdcount = count

        # Step 2: clamp spans that exceed the table's dimensions.
        # NOTE(review): because of the elif, a cell carrying both attributes
        # only has its rowspan checked -- confirm this is intended.
        for tr in table.childNodes:
            for td in tr.childNodes:
                if iselement(td):
                    if hasatt(td, 'rowspan'):
                        if int(attget(td, 'rowspan')) > trcount:
                            attset(td, 'rowspan', str(trcount))
                    elif hasatt(td, 'colspan'):
                        if int(attget(td, 'colspan')) > tdcount:
                            attset(td, 'colspan', str(tdcount))

        # Step 3: one list of td elements per tr; rows without td are skipped.
        grid = []
        for tr in table.childNodes:
            if iselement(tr) and isname(tr, 'tr'):
                cells = [td for td in tr.childNodes
                         if iselement(td) and isname(td, 'td')]
                if cells:
                    grid.append(cells)

        # Step 4: repeat row-spanning cells in the rows they extend into.
        for td in tagsbyAttr(table, 'rowspan'):
            # Walk the following siblings of the cell's row directly and stop
            # when they run out; chaining past a missing sibling would raise
            # AttributeError.
            # NOTE(review): every sibling counts as one step, not only tr
            # elements -- presumably the tree holds no whitespace text nodes
            # between rows; confirm.
            trs = []
            sibling = td.parentNode
            for _ in range(int(attget(td, 'rowspan')) - 1):
                sibling = sibling.nextSibling
                if sibling is None:
                    break
                trs.append(sibling)

            # Reset per cell so an index left over from a previous cell is
            # never reused when this cell is not part of the grid.
            loc = None
            for nodelist in grid:
                if td in nodelist:
                    loc = nodelist.index(td)
                for tr in trs:
                    if loc is not None and tr.firstChild in nodelist:
                        nodelist.insert(loc, td)

        # Step 5: shrink colspans that do not fit the other rows.
        for nodelist in grid:
            for td in nodelist:
                if not hasatt(td, 'colspan'):
                    continue
                index = nodelist.index(td)
                colspan = int(attget(td, 'colspan'))
                largest = 0
                for nlist in grid:
                    # NOTE(review): for index == 0 the slice stop is -0, so
                    # the window is always empty and cells in the first
                    # column are never shrunk here -- confirm intent.
                    window = nlist[index:-index]
                    actual = len(window)
                    if colspan > actual and actual > largest:
                        spanned = 0
                        for cell in window:
                            if hasatt(cell, 'colspan'):
                                spanned += int(attget(cell, 'colspan'))
                        if spanned != colspan:
                            largest = actual
                if largest:
                    attset(td, 'colspan', str(largest))
                # A span of one column (or less) carries no information.
                if int(attget(td, 'colspan')) <= 1:
                    attdel(td, 'colspan')
def htmlgrid(doc, structure):
    """Arrange the nodes of *structure* into a list of rows.

    *structure* is iterated in order:

    * a ``tr`` node flushes the ``td`` cells collected so far and is added
      as its own one-element row;
    * a ``td`` node not seen before starts or extends the current cell row,
      together with every following sibling that is a ``td``, has not been
      seen before and is itself contained in *structure*;
    * a ``table`` node adds the pending cell row (if not already present)
      and is then added as its own one-element row.

    Afterwards every element of *doc* with a ``rowspan`` attribute is
    appended to each row that contains the first child of one of the
    sibling rows it extends into.

    :param doc: document object passed to ``tagsbyAttr``.
    :param structure: sequence of nodes; must support iteration and ``in``.
    :returns: list of lists of nodes.
    """
    grid, temp, blist = [], [], []
    for node in structure:
        if isname(node, 'tr'):
            if len(temp):
                # NOTE(review): grid[-1] raises IndexError when cells were
                # collected before anything was added to the grid -- confirm
                # whether structure can start with td nodes.
                if temp[0].parentNode in grid[-1]:
                    if temp not in grid:
                        grid.append(temp)
                    grid.append([node])
                else:
                    grid.append([node])
                    if temp not in grid:
                        grid.append(temp)
            else:
                grid.append([node])
            temp = []
        elif isname(node, 'td'):
            if node not in blist:
                temp.append(node)
                blist.append(node)
                # Collect the remaining cells of the same row by walking the
                # sibling chain directly until it ends.
                sibling = node.nextSibling
                while sibling is not None:
                    if (sibling.localName == 'td'
                            and sibling not in blist
                            and sibling in structure):
                        temp.append(sibling)
                        blist.append(sibling)
                    sibling = sibling.nextSibling
        elif isname(node, 'table'):
            if len(temp):
                if temp not in grid:
                    grid.append(temp)
            grid.append([node])
    if len(temp):
        grid.append(temp)

    for td in tagsbyAttr(doc, 'rowspan'):
        # Gather up to rowspan-1 siblings following the cell's row, stopping
        # when the siblings run out; chaining past a missing sibling would
        # raise AttributeError.
        # NOTE(review): every sibling counts as one step, not only tr
        # elements -- presumably there are no whitespace text nodes between
        # rows; confirm.
        trs = []
        sibling = td.parentNode
        for _ in range(int(attget(td, 'rowspan')) - 1):
            sibling = sibling.nextSibling
            if sibling is None:
                break
            trs.append(sibling)
        for nodelist in grid:
            for tr in trs:
                if tr.firstChild in nodelist:
                    nodelist.append(td)
    return grid