def POST(self):
    user = self.getcurrentuser(forAjax=True)
    web.header('Content-Type', 'application/json')
    webInput = web.input()
    category = webInput.get('category', '')
    title = webInput.get('title')
    feedUrl = webInput.get("url")
    isfulltext = bool(webInput.get('isfulltext', '').lower() == 'true')
    creator = webInput.get('creator', '')

    if not title or not feedUrl:
        return json.dumps({'status': _("Title or Url is empty!")})

    opener = URLOpener()
    srvUrl = urlparse.urljoin('http://kindleear.appspot.com/',
                              SharedLibrarykindleearAppspotCom.__url__)
    data = {'category': category,
            'title': title,
            'url': feedUrl,
            'creator': creator,
            'isfulltext': 'true' if isfulltext else 'false',
            'key': 'kindleear.lucky!'}
    result = opener.open(srvUrl, data)
    if result.status_code == 200 and result.content:
        return result.content
    else:
        return json.dumps({'status': _('Cannot submit data to kindleear.appspot.com, status: %s') %
                           URLOpener.CodeMap(result.status_code)})
def ParsePageContent(self, topic, url, urls, count):
    # Request the topic page and fetch its content
    result = self.GetResponseContent(url)
    # Proceed only if the request succeeded and the page is not empty
    if result.status_code == 200 and result.content:
        # Parse the page content into a BeautifulSoup object
        soup = BeautifulSoup(result.content, 'lxml')
        # Locate every article entry in the article list of the current page
        items = soup.find_all(name='span', class_='tw3_01_2_t')
        # Process each article entry
        for item in items:
            title = item.a.string  # article title
            link = item.a.get('href')  # article link
            link = BaseFeedBook.urljoin(url, link)  # build the absolute article link
            count += 1  # number of entries processed so far
            # Stop extracting once the configured article limit is exceeded
            if count > self.max_articles_per_feed:
                break
            # Skip articles whose publication date falls outside the configured range
            if self.OutTimeRange(item):
                continue
            # Append the qualifying article as a tuple to the list
            urls.append((topic, title, link, None))
        # If the topic page has a next page and the article limit has not been
        # reached yet, keep crawling the next page
        next = soup.find(name='a', string='Next')
        if next and count < self.max_articles_per_feed:
            url = BaseFeedBook.urljoin(url, next.get('href'))
            self.ParsePageContent(topic, url, urls, count)
    # Otherwise log the failed request
    else:
        self.log.warn('Fetch article failed(%s):%s' %
                      (URLOpener.CodeMap(result.status_code), url))
def ParseFeedUrls(self):
    urls = []  # list that will hold the article tuples
    # Process each topic page defined in self.feeds
    for feed in self.feeds:
        # Topic name and topic URL from the tuple
        topic, url = feed[0], feed[1]
        # Request the topic URL and fetch its content
        opener = URLOpener(self.host, timeout=self.timeout)
        result = opener.open(url)
        # Proceed only if the request succeeded and the page is not empty
        if result.status_code == 200 and result.content:
            # Parse the page content into a BeautifulSoup object
            soup = BeautifulSoup(result.content, 'lxml')
            # Locate every article entry in the article list of the current page
            items = soup.find('div', class_='grid').find_all(name='div', class_='content')
            # Process each article entry
            for item in items:
                title = item.span.string  # article title
                link = item.a.get('href')  # article link
                link = BaseFeedBook.urljoin(url, link)  # build the absolute article link
                # Skip articles whose publication date falls outside the configured range
                if self.OutTimeRange(item):
                    continue
                urls.append((topic, title, link, None))  # append the article tuple
        # Otherwise log the failed request
        else:
            self.log.warn('Fetch article failed(%s):%s' %
                          (URLOpener.CodeMap(result.status_code), url))
    # Return all extracted articles
    return urls
def GetNewComic(self):
    urls = []
    if not self.feeds:
        return []

    userName = self.UserName()
    decoder = AutoDecoder(isfeed=False)
    for item in self.feeds:
        title, url = item[0], item[1]

        lastCount = LastDelivered.all().filter('username = ', userName).filter('bookname = ', title).get()
        if not lastCount:
            self.log.warn('There is no log in db LastDelivered for name: %s, set to 0' % title)
            oldNum = 0
        else:
            oldNum = lastCount.num

        opener = URLOpener(self.host, timeout=60)
        result = opener.open(url)
        if result.status_code != 200:
            self.log.warn('fetch index page for %s failed[%s] : %s' %
                          (title, URLOpener.CodeMap(result.status_code), url))
            continue

        content = result.content
        content = self.AutoDecodeContent(content, decoder, self.feed_encoding,
                                         opener.realurl, result.headers)

        soup = BeautifulSoup(content, 'lxml')
        allComicTable = soup.find_all('table', {'width': '688'})
        addedForThisComic = False
        for comicTable in allComicTable:
            comicVolumes = comicTable.find_all('a', {'target': '_blank'})
            for volume in comicVolumes:
                texts = volume.text.split(' ')
                if len(texts) > 2 and texts[1].isdigit() and volume.get('href'):
                    num = int(texts[1])
                    if num > oldNum:
                        oldNum = num
                        href = self.urljoin(self.host, volume.get('href'))
                        urls.append((title, num, href))
                        addedForThisComic = True
                        break  # push only one volume at a time (a single volume may already contain many images)
            if addedForThisComic:
                break

    return urls
def ParseFeedUrls(self):
    urls = []  # list that will hold the article tuples
    # Process each topic page defined in self.feeds
    for feed in self.feeds:
        # Topic name and topic URL from the tuple
        topic, url = feed[0], feed[1]
        # Request the topic URL and fetch its content
        opener = URLOpener(self.host, timeout=self.timeout)
        result = opener.open(url)
        # Proceed only if the request succeeded and the page is not empty
        if result.status_code == 200 and result.content:
            # Parse the page content into a BeautifulSoup object
            soup = BeautifulSoup(result.content, 'html.parser')
            # Locate every section block on the current page
            sections = soup.find_all(name='div', class_='column-news')
            # self.log.warn('find %d sections' % len(sections))
            for section in sections:
                tag = section.find(name='ul', class_='column-title')
                sectionName = tag.a.li.string
                tuwens = section.find_all(name='div', class_=re.compile("tuwen-block-"))
                # self.log.warn('%s find %d tuwen' % (sectionName, len(tuwens)))
                for tuwen in tuwens:
                    articles = tuwen.find_all('a')
                    title = ''
                    link = ''
                    for article in articles:
                        if not article.img:
                            title = article.string
                            link = article.get('href')  # article link
                            self.log.warn('title : %s, link: %s' % (title, link))
                            break
                    urls.append((sectionName, title, link, None))  # append the article tuple
                texts = section.find_all(name='li', class_=re.compile("list-text-"))
                # self.log.warn('%s find %d texts' % (sectionName, len(texts)))
                for text in texts:
                    title = text.a.string
                    link = text.a.get('href')  # article link
                    self.log.warn('title : %s, link: %s' % (title, link))
                    urls.append((sectionName, title, link, None))  # append the article tuple
        # Otherwise log the failed request
        else:
            self.log.warn('Fetch article failed(%s):%s' %
                          (URLOpener.CodeMap(result.status_code), url))
    # Return all extracted articles
    return urls
def ParsePageContent(self, topic, url, urls, count):
    # Request the topic page and fetch its content
    result = self.GetResponseContent(url)
    # Proceed only if the request succeeded and the page is not empty
    if result.status_code == 200 and result.content:
        # Parse the page content into a BeautifulSoup object
        soup = BeautifulSoup(result.content, 'lxml')
        # Locate every article entry in the article list of the current page
        items = soup.find_all(name='div', class_='col-md-12 border')
        #topics = soup.find_all(name='small', class_='text-muted')
        # Process each article entry
        for item in items:
            title = item.div.h4.em.string  # article title
            link = item.a.get('onclick').split("'")  # pieces of the article link
            link = ('https://trends.lenovoresearch.cn/tst/article/article-detail/' +
                    "?article_id=" + link[1] + "&sections=" + link[3] +
                    "&dates=" + link[5] + "&sort=" + link[7] +
                    "&search=" + link[9] + "&page=" + link[11] +
                    "&web_source=" + topic)
            #self.log.warn(item.find_all(name='em')[1].string)
            group = item.find_all(name='em')[1].string
            #topic = topics[2*count - 1].string
            # Stop extracting once the configured article limit is exceeded
            if count > self.max_articles_per_feed:
                break
            # Stop once an article's publication date falls outside the configured range
            if self.OutTimeRange(item):
                break
            # Skip articles whose issue is outside the configured range
            if self.OutIssue(item):
                #self.log.warn(self.issue_number)
                continue
            count += 1  # number of entries processed so far
            # Append the qualifying article as a tuple to the list
            urls.append((group, title, link, None))
        # If the topic page has a next page and the article limit has not been
        # reached yet, keep crawling the next page
        next = soup.find_all(name='li', class_='page-item')
        #self.log.warn(next)
        if next[-1].span and count < self.max_articles_per_feed:
            #self.log.warn(temp)
            link = next[-1].a.get("href").replace(" ", "%20")
            links = 'https://trends.lenovoresearch.cn/tst/article/article-list/' + link
            #self.log.warn(links)
            self.ParsePageContent(topic, links, urls, count)
    # Otherwise log the failed request
    else:
        self.log.warn('Fetch article failed(%s):%s' %
                      (URLOpener.CodeMap(result.status_code), url))
def ParseFeedUrls(self):
    urls = []
    userName = self.UserName()
    decoder = AutoDecoder(isfeed=False)

    lastCount = LastDelivered.all().filter('username = ', userName).filter('bookname = ', self.title).get()
    if not lastCount:
        oldNum = 0
        oldChapterTitle = ''
    else:
        oldNum = lastCount.num
        oldChapterTitle = lastCount.record

    opener = URLOpener(self.host, timeout=60)
    result = opener.open(self.feeds)
    if result.status_code != 200:
        self.log.warn('fetch index page for %s failed[%s] : %s' %
                      (self.title, URLOpener.CodeMap(result.status_code), self.feeds))
        return []

    # Extract the chapter list from the page
    content = self.AutoDecodeContent(result.content, decoder, self.feed_encoding,
                                     opener.realurl, result.headers)
    soup = BeautifulSoup(content, 'lxml')
    chapterList = self.GetChapterList(soup)

    chapterNum = 0
    for chapter in chapterList:
        if chapterNum >= self.limit:
            break
        url = chapter.get('href')
        num = self.GetChapterNum(url)
        if num > oldNum:
            oldNum = num
            oldChapterTitle = chapter.text
            chapterNum += 1
            urls.append((self.title, oldChapterTitle, self.urljoin(self.host, url), ''))

    self.UpdateLastDelivered(self.title, oldNum, oldChapterTitle)
    return urls
def fetch(self, url, opener, decoder):
    """Open the network connection, download the page, and decode it."""
    result = opener.open(url)
    status_code, content = result.status_code, result.content
    if status_code not in (200, 206) or not content:
        self.log.warn('fetch page failed(%s):%s.' %
                      (URLOpener.CodeMap(status_code), url))
        return None

    #debug_mail(content)

    if self.page_encoding:
        try:
            return content.decode(self.page_encoding)
        except UnicodeDecodeError:
            return decoder.decode(content, opener.realurl, result.headers)
    else:
        return decoder.decode(content, opener.realurl, result.headers)
def GET(self):
    user = self.getcurrentuser(forAjax=True)
    web.header('Content-Type', 'application/json')

    # Query the sharing server for the category data
    respDict = {'status': 'ok', 'categories': []}

    opener = URLOpener()
    url = urlparse.urljoin('http://kindleear.appspot.com/',
                           SharedLibraryCategorykindleearAppspotCom.__url__)
    result = opener.open(url + '?key=kindleear.lucky!')
    if result.status_code == 200 and result.content:
        respDict['categories'] = json.loads(result.content)
    else:
        respDict['status'] = _('Cannot fetch data from kindleear.appspot.com, status: ') + \
            URLOpener.CodeMap(result.status_code)

    return json.dumps(respDict)
def ParsePageLinks(self, topic, url, urls, count, count2, ccc):
    # Request the topic page or chapter-list page and fetch its content
    result = self.GetResponseContent(url)
    # Proceed only if the request succeeded and the page is not empty
    if result.status_code == 200 and result.content:
        # Parse the topic or list page content into a BeautifulSoup object
        soup = BeautifulSoup(result.content, 'lxml')
        # Locate every article entry on the current page; the tag parameters
        # here need to be adjusted and verified manually
        items = soup.find_all(name='dd')
        # Count the total number of chapters so the newest ones can be fetched
        # to keep up with updates (there is probably a helper for this, but I
        # do not know of one)
        for ttt in items:
            ccc += 1
        # Process each article entry
        for item in items:
            title = item.a.string  # article title
            link = item.a.get('href')  # article link
            link = BaseFeedBook.urljoin(url, link)  # build the absolute article link
            count += 1  # number of entries processed so far
            # Stop extracting once the configured article limit is exceeded.
            # Change the conditions below to choose the crawling mode; comment
            # both out to fetch everything.
            count2 = count + self.max_articles_per_feed
            if count2 < ccc:  # option 1: fetch the last n chapters
                continue
            #if count > self.max_articles_per_feed:  # option 2: fetch the first n chapters
            #    break
            # Append the qualifying article as a tuple to the list
            urls.append((topic, title, link, None))
        # If the topic page has a next page and the article limit has not been
        # reached yet, keep crawling the next page (recursive call)
        #next = soup.find(name='a', string='Next')
        #if next and count < self.max_articles_per_feed:
        #    url = BaseFeedBook.urljoin(url, next.get('href'))
        #    self.ParsePageLinks(topic, url, urls, count)
    # Otherwise log the failed request
    else:
        self.log.warn('Fetch article failed(%s):%s' %
                      (URLOpener.CodeMap(result.status_code), url))
def GET(self):
    user = self.getcurrentuser()

    # Query the sharing server for the shared feed data
    shared_data = []
    tips = ''
    opener = URLOpener()
    url = urlparse.urljoin('http://kindleear.appspot.com/',
                           SharedLibrarykindleearAppspotCom.__url__)
    result = opener.open(url + '?key=kindleear.lucky!')
    if result.status_code == 200 and result.content:
        shared_data = json.loads(result.content)
    else:
        tips = _('Cannot fetch data from kindleear.appspot.com, status: ') + \
            URLOpener.CodeMap(result.status_code)

    return self.render('sharedlibrary.html', "Shared",
                       current='shared',
                       user=user,
                       shared_data=shared_data,
                       tips=tips)
def ParseFeedUrls(self):
    urls = []  # list that will hold the article tuples
    # Process each topic page defined in self.feeds
    for feed in self.feeds:
        # Topic name and topic URL from the tuple
        topic, url = feed[0], feed[1]
        # Request the topic URL and fetch its content
        opener = URLOpener(self.host, timeout=self.timeout)
        result = opener.open(url)
        # Proceed only if the request succeeded and the page is not empty
        if result.status_code == 200 and result.content:
            # Parse the page content into a BeautifulSoup object
            soup = BeautifulSoup(result.content, 'html.parser')
            # self.log.warn('title : %s' % soup.title)
            # Locate every article entry in the article list of the current page
            items = soup.find_all(name='div', class_="content")
            self.log.warn('find : %d articles.' % len(items))
            # Process each article entry
            count = 0
            for item in items:
                title = item.a.string  # article title
                link = item.a.get('href')  # article link
                link = BaseFeedBook.urljoin("https://toutiao.io", link)  # build the absolute article link
                link = self.getRealUrl(link)
                self.log.warn('Fetch article : %s' % link)
                if string.find(link, 'zhihu.com') != -1:
                    link = self.url4forwarder(url)
                    self.log.warn('transport : %s' % link)
                urls.append((topic, title, link, None))  # append the article tuple
                count = count + 1
                if count >= 30:
                    break
        # Otherwise log the failed request
        else:
            self.log.warn('Fetch article failed(%s):%s' %
                          (URLOpener.CodeMap(result.status_code), url))
    # Return all extracted articles
    return urls
def ParseFeedUrls(self):
    urls = []  # list that will hold the article tuples
    # Process each topic page defined in self.feeds
    for feed in self.feeds:
        # Topic name and topic URL from the tuple
        topic, url = feed[0], feed[1]
        # Request the topic URL and fetch its content
        opener = URLOpener(self.host, timeout=self.timeout)
        result = opener.open(url)
        # Proceed only if the request succeeded and the page is not empty
        if result.status_code == 200 and result.content:
            # Parse the page content into a BeautifulSoup object
            soup = BeautifulSoup(result.content, 'html.parser')
            # Locate the first article entry on the current page
            item = soup.find(name='dd')
            count = 0
            while item:  # keep only the most recently updated chapters
                if item.name != 'dd':
                    break
                title = item.a.string  # article title
                link = item.a.get('href')  # article link
                link = BaseFeedBook.urljoin("https://www.72wx.com", link)  # build the absolute article link
                urls.insert(0, (topic, title, link, None))  # prepend the article tuple
                count = count + 1
                if count >= 20:
                    break
                # Advance to the next tag sibling, skipping text nodes; stop at the end
                item = item.next_sibling
                while item and not isinstance(item, element.Tag):
                    item = item.next_sibling
        # Otherwise log the failed request
        else:
            self.log.warn('Fetch article failed(%s):%s' %
                          (URLOpener.CodeMap(result.status_code), url))
    # Return all extracted articles
    return urls
def ParseFeedUrls(self): """ return list like [(section,title,url,desc),..] """ urls = [] urladded = set() url = self.url4forwarder(self.feeds[0][1]) opener = URLOpener(self.host, timeout=self.timeout) result = opener.open(url) if result.status_code == 200 and result.content: feed = json.loads(result.content.decode(self.feed_encoding)) for partition, section in self.partitions: for item in feed[partition]: urlfeed = item['share_url'] if urlfeed in urladded: self.log.info('duplicated, skipped %s' % urlfeed) continue urls.append((section, item['title'], self.url4forwarder(urlfeed), None)) urladded.add(urlfeed) else: self.log.warn('fetch rss failed(%s):%s' % (URLOpener.CodeMap(result.status_code), url)) return urls
def POST(self, mgrType):
    user = self.getcurrentuser(forAjax=True)
    if mgrType == 'reportinvalid':  # report that a feed has become invalid
        web.header('Content-Type', 'application/json')
        title = web.input().get('title', '')
        feedUrl = web.input().get('url', '')

        opener = URLOpener()
        path = SharedLibraryMgrkindleearAppspotCom.__url__.split('/')
        path[-1] = mgrType
        srvUrl = urlparse.urljoin('http://kindleear.appspot.com/', '/'.join(path))
        data = {'title': title, 'url': feedUrl, 'key': 'kindleear.lucky!'}
        result = opener.open(srvUrl, data)
        if result.status_code == 200 and result.content:
            return result.content
        else:
            return json.dumps({'status': _('Cannot fetch data from kindleear.appspot.com, status: ') +
                               URLOpener.CodeMap(result.status_code)})
    else:
        return json.dumps({'status': 'unknown command: %s' % mgrType})
def ParseFeedUrls(self): """ return list like [(section,title,url,desc),..] """ urls = [] tnow = datetime.datetime.utcnow() urladded = set() for feed in self.feeds: section, url = feed[0], feed[1] isfulltext = feed[2] if len(feed) > 2 else False timeout = self.timeout+10 if isfulltext else self.timeout opener = URLOpener(self.host, timeout=timeout) id = urlparse.urlparse(url).query.split('=')[1] result = opener.open(url) if result.status_code == 200 and result.content: if self.feed_encoding: try: content = result.content.decode(self.feed_encoding) except UnicodeDecodeError: content = AutoDecoder(True).decode(result.content,opener.realurl,result.headers) else: content = AutoDecoder(True).decode(result.content,opener.realurl,result.headers) else: self.log.warn('fetch rss failed(%s):%s' % (URLOpener.CodeMap(result.status_code), url)) continue eqs, ekv = process_eqs(content) url = WEIXIN_URL.format(id=id, eqs=urllib.quote(eqs), ekv=ekv, t=int(time.time()*1000)) result = opener.open(url) if result.status_code == 200 and result.content: if self.feed_encoding: try: content = result.content.decode(self.feed_encoding) except UnicodeDecodeError: content = AutoDecoder(True).decode(result.content,opener.realurl,result.headers) else: content = AutoDecoder(True).decode(result.content,opener.realurl,result.headers) content = content[content.find('{'):content.rfind('}')+1] try: content = json.loads(content) except ValueError: continue for e in content['items'][:self.max_articles_per_feed]: e = feedparser.parse(e)['entries'][0] updated = None if hasattr(e, 'lastmodified') and e.lastmodified: updated = float(e.lastmodified) if self.oldest_article > 0 and updated: updated = datetime.datetime.utcfromtimestamp(updated) delta = tnow - updated if self.oldest_article > 365: threshold = self.oldest_article #以秒为单位 else: threshold = 86400*self.oldest_article #以天为单位 if delta.days*86400+delta.seconds > threshold: self.log.info("Skip old article(%s): %s" % (updated.strftime('%Y-%m-%d %H:%M:%S'),e.href)) continue #支持HTTPS if hasattr(e, 'href'): if url.startswith('https://'): urlfeed = e.href.replace('http://','https://') else: urlfeed = e.href if urlfeed in urladded: continue else: urlfeed = '' desc = None urls.append((section, e.title, urlfeed, desc)) urladded.add(urlfeed) else: self.log.warn('fetch rss failed(%s):%s' % (URLOpener.CodeMap(result.status_code), url)) return urls
def readability_by_soup(self, article, url, opts=None, user=None):
    """
    Parse the page manually with BeautifulSoup and extract the main content.
    Because image files consume memory, this function is also a generator.
    """
    content = self.preprocess(article)
    soup = BeautifulSoup(content, "lxml")

    try:
        title = soup.html.head.title.string
    except AttributeError:
        self.log.warn('object soup invalid!(%s)' % url)
        return
    if not title:
        self.log.warn('article has no title.[%s]' % url)
        return

    title = self.processtitle(title)
    soup.html.head.title.string = title

    if self.keep_only_tags:
        body = soup.new_tag('body')
        try:
            if isinstance(self.keep_only_tags, dict):
                keep_only_tags = [self.keep_only_tags]
            else:
                keep_only_tags = self.keep_only_tags
            for spec in keep_only_tags:
                for tag in soup.find('body').find_all(**spec):
                    body.insert(len(body.contents), tag)
            soup.find('body').replace_with(body)
        except AttributeError:  # soup has no body element
            pass

    for spec in self.remove_tags_after:
        tag = soup.find(**spec)
        remove_beyond(tag, 'next_sibling')

    for spec in self.remove_tags_before:
        tag = soup.find(**spec)
        remove_beyond(tag, 'previous_sibling')

    remove_tags = self.insta_remove_tags + self.remove_tags
    remove_ids = self.insta_remove_ids + self.remove_ids
    remove_classes = self.insta_remove_classes + self.remove_classes
    remove_attrs = self.insta_remove_attrs + self.remove_attrs

    for tag in soup.find_all(remove_tags):
        tag.decompose()
    for id in remove_ids:
        for tag in soup.find_all(attrs={"id": id}):
            tag.decompose()
    for cls in remove_classes:
        for tag in soup.find_all(attrs={"class": cls}):
            tag.decompose()
    for attr in remove_attrs:
        for tag in soup.find_all(attrs={attr: True}):
            del tag[attr]
    for cmt in soup.find_all(text=lambda text: isinstance(text, Comment)):
        cmt.extract()

    if self.extra_css:
        sty = soup.new_tag('style', type="text/css")
        sty.string = self.extra_css
        soup.html.head.append(sty)

    self.soupbeforeimage(soup)

    has_imgs = False
    thumbnail = None

    if self.keep_image:
        opener = URLOpener(self.host, timeout=self.timeout)
        for img in soup.find_all('img'):
            # More and more sites lazy-load their images, so handle that here.
            # Note: if a data-src style attribute does not hold the real URL,
            # there is nothing we can do.
            imgurl = img['src'] if 'src' in img.attrs else ''
            if not imgurl:
                for attr in img.attrs:
                    if attr != 'src' and 'src' in attr:  # many sites use data-src
                        imgurl = img[attr]
                        break
            if not imgurl:
                img.decompose()
                continue
            if not imgurl.startswith('data:'):
                if not imgurl.startswith('http'):
                    imgurl = self.urljoin(url, imgurl)
                if self.fetch_img_via_ssl and url.startswith('https://'):
                    imgurl = imgurl.replace('http://', 'https://')
                if self.isfiltered(imgurl):
                    self.log.warn('img filtered:%s' % imgurl)
                    img.decompose()
                    continue
                imgresult = opener.open(imgurl)
                imgcontent = self.process_image(imgresult.content, opts) if imgresult.status_code == 200 else None
                if imgcontent:
                    if len(imgcontent) < self.img_min_size:  # rexdf: too small an image
                        img.decompose()
                        continue
                    imgtype = imghdr.what(None, imgcontent)
                    if imgtype:
                        imgmime = r"image/" + imgtype
                        fnimg = "img%d.%s" % (self.imgindex, 'jpg' if imgtype == 'jpeg' else imgtype)
                        img['src'] = fnimg
                        # Use the first image as the thumbnail for the table of contents
                        if not has_imgs:
                            has_imgs = True
                            thumbnail = imgurl
                            yield (imgmime, imgurl, fnimg, imgcontent, None, True)
                        else:
                            yield (imgmime, imgurl, fnimg, imgcontent, None, None)
                    else:
                        img.decompose()
                else:
                    self.log.warn('fetch img failed(%s):%s' %
                                  (URLOpener.CodeMap(imgresult.status_code), imgurl))
                    img.decompose()

        # Strip links wrapping images so a stray tap does not open the browser
        for img in soup.find_all('img'):
            if img.parent and img.parent.parent and img.parent.name == 'a':
                img.parent.replace_with(img)
    else:
        for img in soup.find_all('img'):
            img.decompose()

    # Add a content title if there is none
    body = soup.html.body
    t = body.find(['h1', 'h2'])
    if not t:
        t = soup.new_tag('h2')
        t.string = title
        body.insert(0, t)
    else:
        totallen = 0
        for ps in t.previous_siblings:
            totallen += len(string_of_tag(ps))
            if totallen > 40:  # this H1/H2 appears mid-article, so it is not the article title
                t = soup.new_tag('h2')
                t.string = title
                body.insert(0, t)
                break

    # Strip all attributes from body so InsertToc can match <body> with a regular expression
    bodyattrs = [attr for attr in body.attrs]
    for attr in bodyattrs:
        del body[attr]

    # Convert HTML5 tags to div
    for x in soup.find_all(['article', 'aside', 'header', 'footer', 'nav',
                            'figcaption', 'figure', 'section', 'time']):
        x.name = 'div'

    self.soupprocessex(soup)

    # Append share links
    if user:
        self.AppendShareLinksToArticle(soup, user, url)

    content = unicode(soup)

    # Use the leading part of the article content as the brief description
    brief = u''
    if GENERATE_TOC_DESC:
        for h in body.find_all(['h1', 'h2']):  # remove h1/h2 so it does not duplicate the title
            h.decompose()
        for s in body.stripped_strings:
            brief += unicode(s) + u' '
            if len(brief) >= TOC_DESC_WORD_LIMIT:
                brief = brief[:TOC_DESC_WORD_LIMIT]
                break

    soup = None
    yield (title, None, None, content, brief, thumbnail)
def readability(self, article, url, opts=None, user=None):
    """
    Extract the full article content with readability-lxml.
    Because image files consume memory, this function is also a generator.
    """
    content = self.preprocess(article)
    if not content:
        return

    # Extract the main content
    try:
        doc = readability.Document(content, positive_keywords=self.positive_classes)
        summary = doc.summary(html_partial=False)
    except:
        # Extraction may fail because the "article" is a bare image
        # (one image per article, not wrapped in HTML)
        imgtype = imghdr.what(None, content)
        if imgtype:  # if it is an image, wrap it in a minimal HTML container
            imgmime = r"image/" + imgtype
            fnimg = "img%d.%s" % (self.imgindex, 'jpg' if imgtype == 'jpeg' else imgtype)
            yield (imgmime, url, fnimg, content, None, None)
            tmphtml = '<html><head><title>Picture</title></head><body><img src="%s" /></body></html>' % fnimg
            yield ('Picture', None, None, tmphtml, '', None)
        else:
            self.log.warn('article is invalid.[%s]' % url)
        return

    title = doc.short_title()
    if not title:
        self.log.warn('article has no title.[%s]' % url)
        return

    title = self.processtitle(title)

    soup = BeautifulSoup(summary, "lxml")

    # If readability fails, fall back to the backup algorithm
    # (not as good, but works under all conditions)
    body = soup.find('body')
    head = soup.find('head')
    if len(body.contents) == 0:
        from simpleextract import simple_extract
        summary = simple_extract(content)
        soup = BeautifulSoup(summary, "lxml")
        body = soup.find('body')
        if not body:
            self.log.warn('extract article content failed.[%s]' % url)
            return
        head = soup.find('head')
        # Add a note that the backup algorithm was used; a disclaimer of sorts
        # in case the extraction quality is poor :)
        info = soup.new_tag('p', style='color:#555555;font-size:60%;text-align:right;')
        info.string = 'extracted by alternative algorithm.'
        body.append(info)
        self.log.info('use alternative algorithm to extract content.')

    if not head:
        head = soup.new_tag('head')
        soup.html.insert(0, head)
    if not head.find('title'):
        t = soup.new_tag('title')
        t.string = title
        head.append(t)

    # Add a content title if there is none
    t = body.find(['h1', 'h2'])
    if not t:
        t = soup.new_tag('h2')
        t.string = title
        body.insert(0, t)
    else:
        totallen = 0
        for ps in t.previous_siblings:
            totallen += len(string_of_tag(ps))
            if totallen > 40:  # this H1/H2 appears mid-article, so it is not the article title
                t = soup.new_tag('h2')
                t.string = title
                body.insert(0, t)
                break

    if self.remove_tags:
        for tag in soup.find_all(self.remove_tags):
            tag.decompose()
    for id in self.remove_ids:
        for tag in soup.find_all(attrs={"id": id}):
            tag.decompose()
    for cls in self.remove_classes:
        for tag in soup.find_all(attrs={"class": cls}):
            tag.decompose()
    for attr in self.remove_attrs:
        for tag in soup.find_all(attrs={attr: True}):
            del tag[attr]
    for cmt in soup.find_all(text=lambda text: isinstance(text, Comment)):
        cmt.extract()

    # Strip all attributes from body so InsertToc can match <body> with a regular expression
    bodyattrs = [attr for attr in body.attrs]
    for attr in bodyattrs:
        del body[attr]

    if self.extra_css:
        sty = soup.new_tag('style', type="text/css")
        sty.string = self.extra_css
        soup.html.head.append(sty)

    self.soupbeforeimage(soup)

    has_imgs = False
    thumbnail = None

    if self.keep_image:
        opener = URLOpener(self.host, timeout=self.timeout)
        for img in soup.find_all('img'):
            # More and more sites lazy-load their images, so handle that here.
            # Note: if a data-src style attribute does not hold the real URL,
            # there is nothing we can do.
            imgurl = img['src'] if 'src' in img.attrs else ''
            if not imgurl:
                for attr in img.attrs:
                    if attr != 'src' and 'src' in attr:  # many sites use data-src
                        imgurl = img[attr]
                        break
            if not imgurl:
                img.decompose()
                continue
            if not imgurl.startswith('data:'):
                if not imgurl.startswith('http'):
                    imgurl = self.urljoin(url, imgurl)
                if self.fetch_img_via_ssl and url.startswith('https://'):
                    imgurl = imgurl.replace('http://', 'https://')
                if self.isfiltered(imgurl):
                    self.log.warn('img filtered : %s' % imgurl)
                    img.decompose()
                    continue
                imgresult = opener.open(imgurl)
                imgcontent = self.process_image(imgresult.content, opts) if imgresult.status_code == 200 else None
                if imgcontent:
                    if len(imgcontent) < self.img_min_size:  # rexdf: too small an image
                        img.decompose()
                        continue
                    imgtype = imghdr.what(None, imgcontent)
                    if imgtype:
                        imgmime = r"image/" + imgtype
                        fnimg = "img%d.%s" % (self.imgindex, 'jpg' if imgtype == 'jpeg' else imgtype)
                        img['src'] = fnimg
                        # Use the first image as the thumbnail for the table of contents
                        if not has_imgs:
                            has_imgs = True
                            thumbnail = imgurl
                            yield (imgmime, imgurl, fnimg, imgcontent, None, True)
                        else:
                            yield (imgmime, imgurl, fnimg, imgcontent, None, None)
                    else:
                        img.decompose()
                else:
                    self.log.warn('fetch img failed(%s):%s' %
                                  (URLOpener.CodeMap(imgresult.status_code), imgurl))
                    img.decompose()

        # Strip links wrapping images so a stray tap does not open the browser
        for img in soup.find_all('img'):
            if img.parent and img.parent.parent and img.parent.name == 'a':
                img.parent.replace_with(img)
    else:
        for img in soup.find_all('img'):
            img.decompose()

    # Convert HTML5 tags to div
    for x in soup.find_all(['article', 'aside', 'header', 'footer', 'nav',
                            'figcaption', 'figure', 'section', 'time']):
        x.name = 'div'

    self.soupprocessex(soup)

    # Append share links
    if user:
        self.AppendShareLinksToArticle(soup, user, url)

    content = unicode(soup)

    # Use the leading part of the article content as the brief description
    brief = u''
    if GENERATE_TOC_DESC:
        for h in body.find_all(['h1', 'h2']):  # remove h1/h2 so it does not duplicate the title
            h.decompose()
        for s in body.stripped_strings:
            brief += unicode(s) + u' '
            if len(brief) >= TOC_DESC_WORD_LIMIT:
                brief = brief[:TOC_DESC_WORD_LIMIT]
                break

    soup = None
    yield (title, None, None, content, brief, thumbnail)
def ParseFeedUrls(self): """ return list like [(section,title,url,desc),..] """ urls = [] tnow = datetime.datetime.utcnow() urladded = set() for feed in self.feeds: section, url = feed[0], feed[1] isfulltext = feed[2] if len(feed) > 2 else False timeout = self.timeout + 10 if isfulltext else self.timeout opener = URLOpener(self.host, timeout=timeout) result = opener.open(url) if result.status_code == 200 and result.content: #debug_mail(result.content, 'feed.xml') if self.feed_encoding: try: content = result.content.decode(self.feed_encoding) except UnicodeDecodeError: content = AutoDecoder(True).decode( result.content, opener.realurl, result.headers) else: content = AutoDecoder(True).decode(result.content, opener.realurl, result.headers) feed = feedparser.parse(content) for e in feed['entries'][:self.max_articles_per_feed]: updated = None if hasattr(e, 'updated_parsed') and e.updated_parsed: updated = e.updated_parsed elif hasattr(e, 'published_parsed') and e.published_parsed: updated = e.published_parsed elif hasattr(e, 'created_parsed'): updated = e.created_parsed if self.oldest_article > 0 and updated: updated = datetime.datetime(*(updated[0:6])) delta = tnow - updated if self.oldest_article > 365: threshold = self.oldest_article #以秒为单位 else: threshold = 86400 * self.oldest_article #以天为单位 if delta.days * 86400 + delta.seconds > threshold: self.log.info( "Skip old article(%s): %s" % (updated.strftime('%Y-%m-%d %H:%M:%S'), e.link)) continue #支持HTTPS if hasattr(e, 'link'): if url.startswith('https://'): urlfeed = e.link.replace('http://', 'https://') else: urlfeed = e.link if urlfeed in urladded: continue else: urlfeed = '' desc = None if isfulltext: summary = e.summary if hasattr(e, 'summary') else None desc = e.content[0]['value'] if (hasattr( e, 'content') and e.content[0]['value']) else None #同时存在,因为有的RSS全文内容放在summary,有的放在content #所以认为内容多的为全文 if summary and desc: desc = summary if len(summary) > len( desc) else desc elif summary: desc = summary if not desc: if not urlfeed: continue else: self.log.warn( 'fulltext feed item no has desc,link to webpage for article.(%s)' % e.title) urls.append((section, e.title, urlfeed, desc)) urladded.add(urlfeed) else: self.log.warn('fetch rss failed(%s):%s' % (URLOpener.CodeMap(result.status_code), url)) return urls
def Items(self, opts=None, user=None):
    """
    Generator that yields one tuple per item.
    For HTML: (section, url, title, content, brief, thumbnail)
    For images: (mime, url, filename, content, brief, thumbnail)
    For images, only the first one has thumbnail=True; the rest are None.
    """
    decoder = AutoDecoder(False)
    timeout = self.timeout
    for section, url in self.feeds:
        opener = URLOpener(self.host, timeout=timeout)
        result = opener.open(url)
        status_code, content = result.status_code, result.content
        if status_code != 200 or not content:
            self.log.warn('fetch article failed(%s):%s.' %
                          (URLOpener.CodeMap(status_code), url))
            continue

        if self.page_encoding:
            try:
                content = content.decode(self.page_encoding)
            except UnicodeDecodeError:
                content = decoder.decode(content, opener.realurl, result.headers)
        else:
            content = decoder.decode(content, opener.realurl, result.headers)

        content = self.preprocess(content)
        soup = BeautifulSoup(content, "lxml")

        head = soup.find('head')
        if not head:
            head = soup.new_tag('head')
            soup.html.insert(0, head)
        if not head.find('title'):
            t = soup.new_tag('title')
            t.string = section
            head.append(t)

        try:
            title = soup.html.head.title.string
        except AttributeError:
            title = section
            #self.log.warn('object soup invalid!(%s)'%url)
            #continue
        title = self.processtitle(title)

        if self.keep_only_tags:
            body = soup.new_tag('body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    keep_only_tags = [self.keep_only_tags]
                else:
                    keep_only_tags = self.keep_only_tags
                for spec in keep_only_tags:
                    for tag in soup.find('body').find_all(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replace_with(body)
            except AttributeError:  # soup has no body element
                pass

        for spec in self.remove_tags_after:
            tag = soup.find(**spec)
            remove_beyond(tag, 'next_sibling')

        for spec in self.remove_tags_before:
            tag = soup.find(**spec)
            remove_beyond(tag, 'previous_sibling')

        remove_tags = self.insta_remove_tags + self.remove_tags
        remove_ids = self.insta_remove_ids + self.remove_ids
        remove_classes = self.insta_remove_classes + self.remove_classes
        remove_attrs = self.insta_remove_attrs + self.remove_attrs

        for tag in soup.find_all(remove_tags):
            tag.decompose()
        for id in remove_ids:
            for tag in soup.find_all(attrs={"id": id}):
                tag.decompose()
        for cls in remove_classes:
            for tag in soup.find_all(attrs={"class": cls}):
                tag.decompose()
        for attr in remove_attrs:
            for tag in soup.find_all(attrs={attr: True}):
                del tag[attr]
        for cmt in soup.find_all(text=lambda text: isinstance(text, Comment)):
            cmt.extract()

        # Strip all attributes from body so InsertToc can match <body> with a regular expression
        body = soup.html.body
        bodyattrs = [attr for attr in body.attrs]
        for attr in bodyattrs:
            del body[attr]

        if self.extra_css:
            sty = soup.new_tag('style', type="text/css")
            sty.string = self.extra_css
            soup.html.head.append(sty)

        has_imgs = False
        thumbnail = None

        if self.keep_image:
            self.soupbeforeimage(soup)
            for img in soup.find_all('img'):
                # More and more sites lazy-load their images, so handle that here.
                # Note: if a data-src style attribute does not hold the real URL,
                # there is nothing we can do.
                imgurl = img['src'] if 'src' in img.attrs else ''
                if not imgurl:
                    for attr in img.attrs:
                        if attr != 'src' and 'src' in attr:  # many sites use data-src
                            imgurl = img[attr]
                            break
                if not imgurl:
                    img.decompose()
                    continue
                if not imgurl.startswith('data:'):
                    if not imgurl.startswith('http'):
                        imgurl = self.urljoin(url, imgurl)
                    if self.fetch_img_via_ssl and url.startswith('https://'):
                        imgurl = imgurl.replace('http://', 'https://')
                    if self.isfiltered(imgurl):
                        self.log.warn('img filtered:%s' % imgurl)
                        img.decompose()
                        continue
                    imgresult = opener.open(imgurl)
                    imgcontent = self.process_image(imgresult.content, opts) if imgresult.status_code == 200 else None
                    if imgcontent:
                        if len(imgcontent) < self.img_min_size:  # rexdf: too small an image
                            img.decompose()
                            continue
                        imgtype = imghdr.what(None, imgcontent)
                        if imgtype:
                            imgmime = r"image/" + imgtype
                            fnimg = "img%d.%s" % (self.imgindex, 'jpg' if imgtype == 'jpeg' else imgtype)
                            img['src'] = fnimg
                            # Use the first image as the thumbnail for the table of contents
                            if not has_imgs:
                                has_imgs = True
                                thumbnail = imgurl
                                yield (imgmime, imgurl, fnimg, imgcontent, None, True)
                            else:
                                yield (imgmime, imgurl, fnimg, imgcontent, None, None)
                        else:
                            img.decompose()
                    else:
                        self.log.warn('fetch img failed(%s):%s' %
                                      (URLOpener.CodeMap(imgresult.status_code), imgurl))
                        img.decompose()

            # Strip links wrapping images
            for img in soup.find_all('img'):
                if img.parent and img.parent.parent and img.parent.name == 'a':
                    img.parent.replace_with(img)
        else:
            for img in soup.find_all('img'):
                img.decompose()

        self.soupprocessex(soup)
        content = unicode(soup)

        # Use the leading part of the article content as the brief description
        brief = u''
        if GENERATE_TOC_DESC:
            for h in body.find_all(['h1', 'h2']):  # remove h1/h2 so it does not duplicate the title
                h.decompose()
            for s in body.stripped_strings:
                brief += unicode(s) + u' '
                if len(brief) >= TOC_DESC_WORD_LIMIT:
                    brief = brief[:TOC_DESC_WORD_LIMIT]
                    break
        soup = None

        content = self.postprocess(content)
        yield (section, url, title, content, brief, thumbnail)