def PageListParser(url):
    """Fetch ``url`` and run the list-page pipeline over its markup.

    Returns ``nothing.Nothing()`` when the response is flagged as served
    from cache (``fromcache``); otherwise returns whatever ``digest``
    produces from the second element of ``parse``'s result together with
    the final URL reported by the response.

    Raises:
        NotImplementedError: the response's main content type is not
            ``text``.
    """
    response, body = httplib21.request(url)

    # Cached responses are short-circuited before any header inspection.
    if response.fromcache:
        return nothing.Nothing()

    if response.info().getmaintype() != 'text':
        raise NotImplementedError(
            'we expect a text format, not the %s' % response.info().gettype())

    # The declared charset, or None when the header does not carry one
    # (per the original author's note).
    declared_charset = response.info().getparam('charset')
    soup = BS(body, from_encoding=declared_charset)

    parsed = parse(soup)
    return digest(parsed[1], response.geturl())
def PageContentParser(url):
    """Fetch ``url`` and pick a content parser based on its MIME type.

    Dispatch, in order:
      * main type ``text``              -> ``ArticleExtractor(resp, cont)``
      * ``application/vnd.ms-excel``    -> ``excelParser(cont)``
      * ``application/msword``          -> an instance of an ad-hoc
        ``MsWord`` class exposing ``getMainContent()`` (a fixed HTML
        notice) and ``getTitlePrefix()`` (``u'[DOC]'``); the document
        itself is not parsed.

    Raises:
        TypeError: the content type matches none of the above.
    """
    response, body = httplib21.request(url)

    if response.info().getmaintype() == 'text':
        return ArticleExtractor(response, body)

    if response.info().gettype() == 'application/vnd.ms-excel':
        return excelParser(body)

    if response.info().gettype() == 'application/msword':
        # Word documents are not parsed; a placeholder object that answers
        # the two getters with constant values is returned instead.
        notice = u'再等等,或许下辈子我会看懂<a target="_blank" href="http://download.microsoft.com/download/0/B/E/0BE8BDD7-E5E8-422A-ABFD-4342ED7AD886/Word97-2007BinaryFileFormat%28doc%29Specification.pdf">word的格式</a>。'
        members = {
            'getMainContent': lambda self: notice,
            'getTitlePrefix': lambda self: u'[DOC]',
        }
        return type('MsWord', (), members)()

    raise TypeError(
        'I have no idea how the %s is formatted' % response.info().gettype())
def calcArea(self):
    """Estimate the pixel area of the image node in ``self.bs_node``.

    The ``height``/``width`` attributes are read first (a trailing ``px``
    is tolerated).  If either one is missing or not an integer, the image
    at ``self.bs_node['src']`` is downloaded to learn its real size and
    the missing dimension is derived from the image's aspect ratio.  If
    the download or decoding fails, a 1:1 aspect ratio is assumed.

    Returns:
        0 when a dimension is given as a percentage, or when a dimension
        is at most ``minpix`` (55) pixels; otherwise ``width * height``.
    """
    minpix = 55  # mind the side-bar pics

    height = self.bs_node.get('height', Nothing()).strip().rstrip('px')
    width = self.bs_node.get('width', Nothing()).strip().rstrip('px')

    # if you use percentage in height or width,
    # in most cases it cannot be the main-content
    if height.endswith('%') or width.endswith('%'):
        return 0

    # Anything that cannot be read as an integer counts as "unknown" (0).
    # `except Exception` keeps the best-effort behaviour without also
    # swallowing KeyboardInterrupt / SystemExit like a bare `except:` does.
    try:
        height = int(height)
    except Exception:
        height = 0
    try:
        width = int(width)
    except Exception:
        width = 0

    # An explicitly declared small dimension settles it without a download.
    if 0 < height <= minpix or 0 < width <= minpix:
        return 0

    if not (height and width):
        fp = cStringIO.StringIO()
        try:
            _resp, content = httplib21.request(self.bs_node['src'])
            fp.write(content)
            fp.seek(0)
            w, h = Image.open(fp).size
        except Exception:
            # Best effort: on any fetch/decode failure assume a square image.
            h = w = 1.0
        finally:
            fp.close()

        # Guard the divisions below against a degenerate reported size.
        if h <= 0 or w <= 0:
            h = w = 1.0

        hdw = h / float(w)  # height-to-width ratio; we need float here
        if not (height or width):
            height, width = h, w  # no need to convert
        elif not height:
            height = int(hdw * width)
        else:
            # width = height / (h / w).  The original multiplied by the
            # ratio here, which is only right for square images.
            width = int(height / hdw)

    if height <= minpix or width <= minpix:
        return 0
    return width * height