def _split(inputfile, outputdir):
    """Split the ``<section>`` elements of an HTML file into separate slides.

    The file at ``inputfile`` is parsed with PyQuery and every ``<section>``
    element is visited:

    * A section carrying the ``stack`` class gets its own sub-directory of
      ``outputdir`` named after a two-digit running section counter
      (``01``, ``02``, ...). Each ``<section>`` found in the stack's inner
      HTML is handed to ``_dump_slide`` with that sub-directory.
    * Any other section is handed to ``_dump_slide`` with ``outputdir``,
      unless its parent has the ``stack`` class (those are already handled
      through their stack).

    The slide counter is shared across stacks and top-level slides, so slide
    indices keep increasing through the whole document.

    Args:
        inputfile: Path of the HTML file to read.
        outputdir: Directory receiving the slides; created if missing.
    """
    # Use a context manager so the handle is released even if read() raises.
    with open(inputfile, 'r') as source:
        html = source.read()

    if not os.path.isdir(outputdir):
        os.mkdir(outputdir)

    idx_slide = 0
    idx_section = 0
    parsed = PyQuery(html)

    for section in parsed('section'):
        slide = PyQuery(section)

        if slide.has_class('stack'):
            idx_section += 1
            stack_path = os.path.join(outputdir, '%02d' % idx_section)
            # Guarded like outputdir above so that re-running against an
            # existing output tree does not fail on the stack directories.
            if not os.path.isdir(stack_path):
                os.mkdir(stack_path)

            # The stack's inner HTML is re-parsed to enumerate its slides.
            # NOTE(review): sub_slide is a raw element here, while the branch
            # below passes a PyQuery object -- confirm _dump_slide accepts both.
            for sub_slide in PyQuery(slide.html())('section'):
                idx_slide += 1
                _dump_slide(sub_slide, idx_slide, stack_path)
        elif not slide.parent().has_class('stack'):
            # parsed('section') also yields the sections nested inside a
            # stack; skip those here since the stack branch dumps them.
            idx_slide += 1
            _dump_slide(slide, idx_slide, outputdir)
def find_ideal_tables(self, tables):
    """Return the entries of ``tables`` that are not nested in another entry.

    For every table, its ancestor chain is walked with PyQuery's
    ``parent()``; if any other entry of ``tables`` appears on that chain the
    table is dropped. The relative order of the remaining tables is kept.

    pyquery is imported lazily. When it cannot be imported, a message is
    written to stderr and an empty list is returned.

    Args:
        tables: Sequence of table nodes. It is iterated more than once, so
            it must be re-iterable.
            # NOTE(review): presumably lxml elements -- confirm with caller.

    Returns:
        A new list with the tables that have no other table as ancestor.
    """
    try:
        from pyquery import PyQuery
    except ImportError:
        # Only a missing dependency is tolerated; a bare ``except`` would
        # also swallow KeyboardInterrupt and SystemExit.
        # sys.stderr.write works on Python 2 and 3 alike, unlike ``print >>``.
        sys.stderr.write("could not import pyquery\n")
        return []

    def _is_nested(table):
        # True when some other entry of ``tables`` is an ancestor of table.
        for other in tables:
            if table == other:
                continue
            target = PyQuery(other)
            node = PyQuery(table)
            # parent() eventually yields an empty selection above the root,
            # which terminates the walk.
            while len(node):
                if node == target:
                    return True
                node = node.parent()
        return False

    return [table for table in tables if not _is_nested(table)]