def Items(self):
    """Generator. For each feed entry, yields article tuples
    (section, link, title, desc) and, when keep_image is set,
    image tuples (mime, url, filename, content)."""
    itemsprocessed = []
    cnt4debug = 0
    opener = URLOpener(self.host)
    decoder = AutoDecoder()
    for section, url in self.feeds:
        cnt4debug += 1
        if IsRunInLocal and cnt4debug > 1:
            break
        result = opener.open(url)
        status_code, content = result.status_code, result.content
        if status_code != 200 or not content:
            logging.error('err(%d) to fetch %s.' % (status_code, url))
            continue
        if self.feed_encoding:
            content = content.decode(self.feed_encoding)
        else:
            content = decoder.decode(content)
        content = self.preprocess(content)
        feed = feedparser.parse(content)
        for e in feed['entries']:
            # If a full-text RSS feed carries ads or other unwanted
            # content, strip it in postprocess.
            desc = self.postprocess(e.description)
            desc = self.FragToXhtml(desc, e.title, self.feed_encoding)
            if self.keep_image:
                # Parse the entry description (not the whole feed) so that
                # images inside it can be fetched and localised.
                soup = BeautifulSoup(desc)
                self.soupbeforeimage(soup)
                for img in soup.findAll('img'):
                    imgurl = img['src']
                    if not imgurl.startswith('http') and not imgurl.startswith('www'):
                        imgurl = self.urljoin(url, imgurl)
                    imgresult = opener.open(imgurl)
                    imgcontent = imgresult.content if imgresult.status_code == 200 else None
                    if imgcontent:
                        imgtype = imghdr.what(None, imgcontent)
                        if imgtype:
                            imgmime = "image/" + imgtype
                            if imgtype == 'jpeg':
                                fnimg = "%d.jpg" % random.randint(10000, 99999999)
                            else:
                                fnimg = "%d.%s" % (random.randint(10000, 99999999), imgtype)
                            img['src'] = fnimg
                            yield (imgmime, imgurl, fnimg, imgcontent)
                self.soupprocessex(soup)
                desc = soup.renderContents('utf-8').decode('utf-8')
                soup = None
            if e.title not in itemsprocessed and desc:
                itemsprocessed.append(e.title)
                yield (section, e.link, e.title, desc)
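# A minimal consumption sketch for the RSS Items() generator above. The
# function name and the dispatch rule are illustrative assumptions: article
# tuples are (section, link, title, desc) while image tuples lead with an
# 'image/...' mime string, so the first field is used to tell them apart
# (this heuristic breaks if a section name itself starts with 'image/').
def demo_consume_rss_items(book):
    articles, images = [], []
    for item in book.Items():
        if item[0].startswith('image/'):
            images.append(item)    # (mime, imgurl, filename, raw bytes)
        else:
            articles.append(item)  # (section, link, title, html)
    return articles, images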
def sanitize_contents(self, contents):
    """Strip scripting vectors from fetched HTML: <script>/<meta>/<head>/<link>
    tags, on* event-handler attributes, and javascript: URLs."""
    soup = BeautifulSoup(contents)
    for tagname in ['script', 'meta', 'head', 'link']:
        for tag in soup.findAll(tagname):
            tag.extract()
    attr_re = re.compile('^on.*', re.I)
    for tag in soup.findAll():
        # Collect the matching attribute names first: deleting while
        # iterating over tag.attrs would skip entries.
        for attr in [attr for attr, _ in tag.attrs if attr_re.match(attr)]:
            del tag[attr]
    for tag in soup.findAll(attrs={'href': re.compile(r'^\s*javascript:.*', re.I)}):
        del tag['href']
    for tag in soup.findAll(attrs={'src': re.compile(r'^\s*javascript:.*', re.I)}):
        del tag['src']
    return soup.renderContents()
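# Illustration of sanitize_contents(); demo_sanitize and its input are
# hypothetical, and the exact output markup depends on how BeautifulSoup
# normalises the fragment.
def demo_sanitize(book):
    dirty = ('<head><script>evil()</script></head>'
             '<body onload="evil()"><a href="javascript:evil()">x</a></body>')
    return book.sanitize_contents(dirty)
    # expected: head/script gone, onload stripped, javascript: href removed,
    # leaving roughly '<body><a>x</a></body>'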
def Items(self): """ 生成器,返回一个元组 对于HTML:section,url,title,content 对于图片,mime,url,filename,content """ cnt4debug = 0 decoder = AutoDecoder() for section, url in self.feeds: cnt4debug += 1 if IsRunInLocal and cnt4debug > 1: break opener = URLOpener(self.host) result = opener.open(url) status_code, content = result.status_code, result.content if status_code != 200 or not content: logging.error('err(%d) to fetch %s.' % (status_code,url)) continue if self.page_encoding: content = content.decode(self.page_encoding) else: content = decoder.decode(content) content = self.preprocess(content) soup = BeautifulSoup(content) try: title = soup.html.head.title.string except AttributeError: logging.error('object soup invalid!(%s)'%url) continue title = self.processtitle(title) if self.keep_only_tags: body = Tag(soup, 'body') try: if isinstance(self.keep_only_tags, dict): self.keep_only_tags = [self.keep_only_tags] for spec in self.keep_only_tags: for tag in soup.find('body').findAll(**spec): body.insert(len(body.contents), tag) soup.find('body').replaceWith(body) except AttributeError: # soup has no body element pass def remove_beyond(tag, next): # 鍐呭祵鍑芥暟 while tag is not None and getattr(tag, 'name', None) != 'body': after = getattr(tag, next) while after is not None: ns = getattr(tag, next) after.extract() after = ns tag = tag.parent if self.remove_tags_after: rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after for spec in rt: tag = soup.find(**spec) remove_beyond(tag, 'nextSibling') if self.remove_tags_before: tag = soup.find(**self.remove_tags_before) remove_beyond(tag, 'previousSibling') remove_tags = self.insta_remove_tags + self.remove_tags remove_ids = self.insta_remove_ids + self.remove_ids remove_classes = self.insta_remove_classes + self.remove_classes remove_attrs = self.insta_remove_attrs + self.remove_attrs for tag in soup.findAll(remove_tags): tag.extract() for id in remove_ids: for tag in soup.findAll(attrs={"id":id}): tag.extract() for cls in remove_classes: for tag in soup.findAll(attrs={"class":cls}): tag.extract() for attr in remove_attrs: for tag in soup.findAll(attrs={attr:True}): del tag[attr] for tag in soup.findAll(attrs={"type":"text/css"}): tag.extract() for cmt in soup.findAll(text=lambda text:isinstance(text, Comment)): cmt.extract if self.keep_image: self.soupbeforeimage(soup) for img in soup.findAll('img'): imgurl = img['src'] if not imgurl.startswith('http') and not imgurl.startswith('www'): imgurl = self.urljoin(url, imgurl) imgresult = opener.open(imgurl) imgcontent = imgresult.content if imgresult.status_code == 200 else None if imgcontent: imgtype = imghdr.what(None, imgcontent) if imgtype: imgmime = r"image/" + imgtype if imgtype == 'jpeg': fnimg = "%d.jpg" % random.randint(10000,99999999) else: fnimg = "%d.%s" % (random.randint(10000,99999999), imgtype) img['src'] = fnimg yield (imgmime, imgurl, fnimg, imgcontent) else: for img in soup.findAll('img'): img.extract() self.soupprocessex(soup) content = soup.renderContents('utf-8').decode('utf-8') soup = None content = self.postprocess(content) yield (section, url, title, content)
def fulltext(self, url, decoder):
    """Fetch the full text of one article page. Image data is large, so to
    save memory this is also a generator: it yields image tuples
    (mime, url, filename, content), then finally (title, None, None, content)."""
    if self.fulltext_by_instapaper:
        url = "http://www.instapaper.com/m?u=%s" % self.url_unescape(url)
    opener = URLOpener(self.host)
    result = opener.open(url)
    status_code, content = result.status_code, result.content
    if status_code != 200 or not content:
        logging.error('err(%d) to fetch %s.' % (status_code, url))
        return
    if self.page_encoding:
        content = content.decode(self.page_encoding)
    else:
        content = decoder.decode(content)
    content = self.preprocess(content)
    soup = BeautifulSoup(content)
    try:
        title = soup.html.head.title.string
    except AttributeError:
        logging.error('object soup invalid!(%s)' % url)
        return
    title = self.processtitle(title)
    soup.html.head.title.string = title

    if self.keep_only_tags:
        body = Tag(soup, 'body')
        try:
            if isinstance(self.keep_only_tags, dict):
                self.keep_only_tags = [self.keep_only_tags]
            for spec in self.keep_only_tags:
                for tag in soup.find('body').findAll(**spec):
                    body.insert(len(body.contents), tag)
            soup.find('body').replaceWith(body)
        except AttributeError:  # soup has no body element
            pass

    def remove_beyond(tag, next):  # nested helper function
        # Walk up from tag to body, extracting every sibling on the
        # `next` side at each level.
        while tag is not None and getattr(tag, 'name', None) != 'body':
            after = getattr(tag, next)
            while after is not None:
                ns = getattr(after, next)  # remember the sibling before extracting
                after.extract()
                after = ns
            tag = tag.parent

    if self.remove_tags_after:
        rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'nextSibling')

    if self.remove_tags_before:
        tag = soup.find(**self.remove_tags_before)
        remove_beyond(tag, 'previousSibling')

    remove_tags = self.insta_remove_tags + self.remove_tags
    remove_ids = self.insta_remove_ids + self.remove_ids
    remove_classes = self.insta_remove_classes + self.remove_classes
    remove_attrs = self.insta_remove_attrs + self.remove_attrs
    for tag in soup.findAll(remove_tags):
        tag.extract()
    for id in remove_ids:
        for tag in soup.findAll(attrs={"id": id}):
            tag.extract()
    for cls in remove_classes:
        for tag in soup.findAll(attrs={"class": cls}):
            tag.extract()
    for attr in remove_attrs:
        for tag in soup.findAll(attrs={attr: True}):
            del tag[attr]
    for tag in soup.findAll(attrs={"type": "text/css"}):
        tag.extract()
    for cmt in soup.findAll(text=lambda text: isinstance(text, Comment)):
        cmt.extract()

    if self.keep_image:
        self.soupbeforeimage(soup)
        for img in soup.findAll('img'):
            imgurl = img['src']
            if not imgurl.startswith('http') and not imgurl.startswith('www'):
                imgurl = self.urljoin(url, imgurl)
            imgresult = opener.open(imgurl)
            imgcontent = imgresult.content if imgresult.status_code == 200 else None
            if imgcontent:
                imgtype = imghdr.what(None, imgcontent)
                if imgtype:
                    imgmime = "image/" + imgtype
                    if imgtype == 'jpeg':
                        fnimg = "%d.jpg" % random.randint(10000, 99999999)
                    else:
                        fnimg = "%d.%s" % (random.randint(10000, 99999999), imgtype)
                    img['src'] = fnimg
                    yield (imgmime, imgurl, fnimg, imgcontent)
    else:
        for img in soup.findAll('img'):
            img.extract()

    self.soupprocessex(soup)
    content = soup.renderContents('utf-8').decode('utf-8')
    soup = None
    yield (title, None, None, content)
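# Usage sketch for fulltext(); demo_fulltext is hypothetical. Image tuples
# arrive first and the article itself arrives last as (title, None, None,
# html), so the two None fields mark the final tuple:
def demo_fulltext(book, url):
    decoder = AutoDecoder()
    images, article = [], None
    for item in book.fulltext(url, decoder):
        if item[1] is None and item[2] is None:
            article = (item[0], item[3])   # (title, html)
        else:
            images.append(item)            # (mime, imgurl, filename, bytes)
    return article, images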