def Items(self, opts=None, user=None):
    """Generator yielding one tuple per feed item.

    For HTML articles: (section, url, title, content, brief, thumbnail)
    For images: (mime, url, filename, content, brief, thumbnail)

    opts/user are passed through to the readability extractor.
    """
    urls = self.ParseFeedUrls()
    readability = self.readability if self.fulltext_by_readability else self.readability_by_soup
    prevsection = ''
    opener = URLOpener(self.host, timeout=self.timeout)
    decoder = AutoDecoder(False)
    for section, ftitle, url, desc in urls:
        if not desc:  # summary-only RSS: the article page must be fetched
            if section != prevsection or prevsection == '':
                decoder.encoding = ''  # re-detect encoding for every section
                prevsection = section
                # fresh connection (and login, if required) per section
                opener = URLOpener(self.host, timeout=self.timeout)
                if self.needs_subscription:
                    self.login(opener, decoder)
            article = self.fetcharticle(url, opener, decoder)
            if not article:
                continue
        else:  # full-text RSS: the feed already carries the article body
            article = self.FragToXhtml(desc, ftitle)

        # For image items the first tuple element (title) is the mime type.
        for title, imgurl, imgfn, content, brief, thumbnail in readability(article, url, opts, user, ftitle):
            if title.startswith(r'image/'):  # image
                yield (title, imgurl, imgfn, content, brief, thumbnail)
            else:
                # Use the feed-provided title when the page yields none, or
                # when the recipe forces the feed title. (Original performed
                # the same assignment in two consecutive ifs.)
                if not title or self.force_ftitle:
                    title = ftitle
                content = self.postprocess(content)
                yield (section, url, title, content, brief, thumbnail)
def Items(self, opts=None, user=None):
    """Generator yielding one tuple per feed item.

    For HTML articles: (section, url, title, content, brief)
    For images: (mime, url, filename, content, brief)
    """
    extract = self.readability if self.fulltext_by_readability else self.readability_by_soup
    dec = AutoDecoder(False)
    last_section = ''
    for section, feed_title, link, summary in self.ParseFeedUrls():
        if summary:  # full-text feed: body ships inside the feed itself
            article = self.FragToXhtml(summary, feed_title)
        else:  # summary feed: download the article page
            if section != last_section or last_section == '':
                dec.encoding = ''  # start every section with fresh encoding detection
                last_section = section
            article = self.fetcharticle(link, dec)
            if not article:
                continue
        # For image items the `title` slot carries the mime type.
        for title, imgurl, imgfn, content, brief in extract(article, link, opts, user):
            if title.startswith(r'image/'):
                yield (title, imgurl, imgfn, content, brief)
            else:
                content = self.postprocess(content)
                yield (section, link, title or feed_title, content, brief)
def Items(self, opts=None, user=None):
    """Generator over feed items.

    Yields (section, url, title, content, brief) for HTML articles and
    (mime, url, filename, content, brief) for images.
    """
    all_urls = self.ParseFeedUrls()
    if self.fulltext_by_readability:
        parse = self.readability
    else:
        parse = self.readability_by_soup
    decoder = AutoDecoder(False)
    prev = ''
    for section, ftitle, url, desc in all_urls:
        if not desc:  # summary-only entry: fetch the page itself
            if prev == '' or section != prev:
                decoder.encoding = ''  # re-detect encoding per section
                prev = section
            article = self.fetcharticle(url, decoder)
            if not article:
                continue
        else:  # the feed already contains the full article
            article = self.FragToXhtml(desc, ftitle)
        # Note: for image tuples `title` holds the mime type.
        for title, imgurl, imgfn, content, brief in parse(article, url, opts, user):
            if title.startswith(r'image/'):
                yield (title, imgurl, imgfn, content, brief)
                continue
            if not title:
                title = ftitle
            yield (section, url, title, self.postprocess(content), brief)
def Items(self, opts=None, user=None):
    """Generator over feed items.

    Yields (section, url, title, content, brief, thumbnail) for HTML
    articles and (mime, url, filename, content, brief, thumbnail) for
    images.
    """
    item_list = self.ParseFeedUrls()
    extractor = (self.readability if self.fulltext_by_readability
                 else self.readability_by_soup)
    last_sec = ''
    conn = URLOpener(self.host, timeout=self.timeout)
    decoder = AutoDecoder(False)
    for section, feed_title, link, summary in item_list:
        if summary:  # full-text feed: article body is embedded in the feed
            article = self.FragToXhtml(summary, feed_title)
        else:  # summary feed: the article page has to be downloaded
            if section != last_sec or last_sec == '':
                decoder.encoding = ''  # fresh encoding detection per section
                last_sec = section
                # new section: fresh connection, and log in when needed
                conn = URLOpener(self.host, timeout=self.timeout)
                if self.needs_subscription:
                    self.login(conn, decoder)
            article = self.fetcharticle(link, conn, decoder)
            if not article:
                continue
        # Image tuples carry the mime type in the `title` slot.
        for title, imgurl, imgfn, content, brief, thumbnail in extractor(article, link, opts, user):
            if title.startswith(r'image/'):
                yield (title, imgurl, imgfn, content, brief, thumbnail)
            else:
                # Feed title wins when the user asks for it, or when the
                # extractor produced no title at all.
                if (user and user.use_title_in_feed) or not title:
                    title = feed_title
                yield (section, link, title, self.postprocess(content), brief, thumbnail)
def Items(self):
    """Generator yielding one tuple per feed item.

    For HTML articles: (section, url, title, content, brief)
    For images: (mime, url, filename, content, brief)

    NOTE(review): Python 2 only -- uses `except E, e` syntax and the
    identifier `async`, which became a keyword in Python 3.7.
    """
    urls = self.ParseFeedUrls()
    readability = self.readability if self.fulltext_by_readability else self.readability_by_soup
    prevsection = ''
    decoder = AutoDecoder()
    if USE_ASYNC_URLFETCH:
        async = AsyncURLFetchManager()
        # Start asynchronous fetches for the summary-only (non full-text) articles.
        rpcs = {url: async.fetch_async(url) for _a, _b, url, desc in urls if not desc}
        # For efficiency, process full-text RSS entries first: their bodies
        # are already in the feed, and the async fetches run meanwhile.
        for section, ftitle, url, desc in urls:
            if not desc:
                continue
            article = self.FragToXhtml(desc, ftitle)
            # For image items, `title` holds the mime type.
            for title, imgurl, imgfn, content, brief in readability(article, url):
                if title.startswith(r'image/'):  # image
                    yield (title, imgurl, imgfn, content, brief)
                else:
                    if not title:
                        title = ftitle
                    content = self.postprocess(content)
                    yield (section, url, title, content, brief)
        # Now the summary-only RSS entries, collecting the async results.
        for section, ftitle, url, desc in urls:
            if desc:
                continue
            if section != prevsection or prevsection == '':
                decoder.encoding = ''  # re-detect encoding for every section
                prevsection = section
            try:
                resp = rpcs[url].get_result()
            except urlfetch.DownloadError, e:
                self.log.warn(str(e))
                continue
            except Exception, e:
                self.log.warn('%s:%s.' % (str(e), url))
                continue
            status_code, content = resp.status_code, resp.content
            if status_code != 200 or not content:
                self.log.warn('async fetch article failed(%d):%s.' % (status_code, url))
                continue
            if self.page_encoding:
                article = content.decode(self.page_encoding)
            else:
                article = decoder.decode(content)
            # For image items, `title` holds the mime type.
            for title, imgurl, imgfn, content, brief in readability(article, url):
                if title.startswith(r'image/'):  # image
                    yield (title, imgurl, imgfn, content, brief)
                else:
                    if not title:
                        title = ftitle
                    content = self.postprocess(content)
                    yield (section, url, title, content, brief)
                    # NOTE(review): the next yield appears duplicated at the
                    # chunk seam (identical tuple yielded twice in the
                    # original text) -- likely an extraction artifact;
                    # verify against the upstream revision.
                    yield (section, url, title, content, brief)
    else:  # synchronous UrlFetch path
        for section, ftitle, url, desc in urls:
            if not desc:  # summary-only RSS
                if section != prevsection or prevsection == '':
                    decoder.encoding = ''  # re-detect encoding per section
                    prevsection = section
                article = self.fetcharticle(url, decoder)
                if not article:
                    continue
            else:
                article = self.FragToXhtml(desc, ftitle)
            # For image items, `title` holds the mime type.
            for title, imgurl, imgfn, content, brief in readability(article, url):
                if title.startswith(r'image/'):  # image
                    yield (title, imgurl, imgfn, content, brief)
                else:
                    if not title:
                        title = ftitle
                    content = self.postprocess(content)
                    yield (section, url, title, content, brief)

def fetcharticle(self, url, decoder):
    """Fetch one article synchronously.

    NOTE(review): this definition is truncated in the visible chunk --
    the body ends right after the open() call; the remainder (decode and
    return) is outside this view.
    """
    # Route through Instapaper's mobilizer when full text is wanted but
    # the local readability extractor is not in use.
    if self.fulltext_by_instapaper and not self.fulltext_by_readability:
        url = "http://www.instapaper.com/m?u=%s" % self.url_unescape(url)
    opener = URLOpener(self.host)
    result = opener.open(url)
def Items(self, opts=None):
    """Generator over feed items.

    HTML articles: (section, url, title, content, brief)
    Images: (mime, url, filename, content, brief)
    """
    urls = self.ParseFeedUrls()
    extract = self.readability if self.fulltext_by_readability else self.readability_by_soup
    decoder = AutoDecoder(False)
    prev_sec = ''

    def emit(article, url, section, ftitle):
        # Run the extractor and yield the tuple of the proper shape; for
        # image items the first element is the mime type.
        for title, imgurl, imgfn, content, brief in extract(article, url, opts):
            if title.startswith(r'image/'):
                yield (title, imgurl, imgfn, content, brief)
            else:
                if not title:
                    title = ftitle
                yield (section, url, title, self.postprocess(content), brief)

    if USE_ASYNC_URLFETCH:
        # Kick off background downloads for every summary-only entry.
        asyncopener = AsyncURLOpener(self.log)
        rpcs = [asyncopener.fetch(url, self.timeout, sec, title)
                for sec, title, url, desc in urls if not desc]
        # Full-text entries first -- their pages need no download, and the
        # other articles keep downloading in the background meanwhile.
        for section, ftitle, url, desc in urls:
            if not desc:
                continue
            for item in emit(self.FragToXhtml(desc, ftitle), url, section, ftitle):
                yield item
        # Then the summary-only entries, as their downloads complete.
        for result, url, (section, ftitle) in asyncopener.get_result():
            if prev_sec == '' or section != prev_sec:
                decoder.encoding = ''  # re-detect encoding per section
                prev_sec = section
            status_code, content = result.status_code, result.content
            if status_code != 200 or not content:
                self.log.warn('async fetch article failed(%d):%s.' % (status_code, url))
                continue
            if self.page_encoding:
                article = content.decode(self.page_encoding)
            else:
                article = decoder.decode(content, url)
            for item in emit(article, url, section, ftitle):
                yield item
    else:  # synchronous UrlFetch path
        for section, ftitle, url, desc in urls:
            if not desc:  # summary-only RSS: fetch the article page
                if prev_sec == '' or section != prev_sec:
                    decoder.encoding = ''
                    prev_sec = section
                article = self.fetcharticle(url, decoder)
                if not article:
                    continue
            else:  # full-text RSS
                article = self.FragToXhtml(desc, ftitle)
            for item in emit(article, url, section, ftitle):
                yield item
def Items(self, opts=None):
    """Yield feed items one at a time.

    HTML article tuples: (section, url, title, content, brief)
    Image tuples: (mime, url, filename, content, brief)
    """
    feed_entries = self.ParseFeedUrls()
    if self.fulltext_by_readability:
        extractor = self.readability
    else:
        extractor = self.readability_by_soup
    dec = AutoDecoder(False)
    last = ''
    if USE_ASYNC_URLFETCH:
        # Fire off background fetches for every summary-only entry.
        fetcher = AsyncURLOpener(self.log)
        pending = [fetcher.fetch(u, self.timeout, s, t)
                   for s, t, u, d in feed_entries if not d]
        # Serve the full-text entries first while the rest download.
        for sec, ftitle, link, frag in feed_entries:
            if not frag:
                continue
            doc = self.FragToXhtml(frag, ftitle)
            for title, imgurl, imgfn, body, brief in extractor(doc, link, opts):
                if title.startswith(r'image/'):  # mime type in the title slot
                    yield (title, imgurl, imgfn, body, brief)
                    continue
                yield (sec, link, title if title else ftitle, self.postprocess(body), brief)
        # Then drain the async results for the summary-only entries.
        for result, link, (sec, ftitle) in fetcher.get_result():
            if sec != last or last == '':
                dec.encoding = ''  # re-detect charset per section
                last = sec
            code, raw = result.status_code, result.content
            if code != 200 or not raw:
                self.log.warn('async fetch article failed(%d):%s.' % (code, link))
                continue
            doc = raw.decode(self.page_encoding) if self.page_encoding else dec.decode(raw, link)
            for title, imgurl, imgfn, body, brief in extractor(doc, link, opts):
                if title.startswith(r'image/'):
                    yield (title, imgurl, imgfn, body, brief)
                    continue
                yield (sec, link, title if title else ftitle, self.postprocess(body), brief)
    else:
        # Plain synchronous path.
        for sec, ftitle, link, frag in feed_entries:
            if frag:  # full-text feed entry
                doc = self.FragToXhtml(frag, ftitle)
            else:  # summary-only: download the page
                if sec != last or last == '':
                    dec.encoding = ''
                    last = sec
                doc = self.fetcharticle(link, dec)
                if not doc:
                    continue
            for title, imgurl, imgfn, body, brief in extractor(doc, link, opts):
                if title.startswith(r'image/'):
                    yield (title, imgurl, imgfn, body, brief)
                    continue
                yield (sec, link, title if title else ftitle, self.postprocess(body), brief)
def Items(self, opts=None):
    """Generator yielding one tuple per feed item.

    For HTML articles: (section, url, title, content)
    For images: (mime, url, filename, content)
    (The yielded tuples actually include a trailing `brief` element.)
    """
    urls = self.ParseFeedUrls()
    readability = self.readability if self.fulltext_by_readability else self.readability_by_soup
    prevsection = ''
    decoder = AutoDecoder()
    if USE_ASYNC_URLFETCH:
        asyncurls = [(i, url) for i, (_a, _b, url, desc) in enumerate(urls) if not desc]
        rpcs = []
        # Start up to MAX_ASYNC_REQUESTS initial fetches.
        # BUGFIX: the original looped `while i < min(MAX_ASYNC_REQUESTS,
        # len(asyncurls))` while popping from asyncurls, so the bound shrank
        # each iteration and only about half the intended requests started.
        # Compute the count once, before any pop.
        for _ in range(min(MAX_ASYNC_REQUESTS, len(asyncurls))):
            rpc = urlfetch.create_rpc(deadline=self.timeout)
            index, url = asyncurls.pop(0)
            urlfetch.make_fetch_call(rpc, url, validate_certificate=False)
            rpcs.append((index, rpc))
        # For efficiency, process full-text RSS entries first: their bodies
        # are already in the feed, and the async fetches run meanwhile.
        for section, ftitle, url, desc in urls:
            if not desc:
                continue
            article = self.FragToXhtml(desc, ftitle)
            # For image items, `title` holds the mime type.
            for title, imgurl, imgfn, content, brief in readability(article, url, opts):
                if title.startswith(r'image/'):  # image
                    yield (title, imgurl, imgfn, content, brief)
                else:
                    if not title:
                        title = ftitle
                    content = self.postprocess(content)
                    yield (section, url, title, content, brief)
        # Now the summary-only RSS entries, draining the rpc queue.
        while rpcs:
            index, rpc = rpcs.pop(0)
            section, ftitle, url, desc = urls[index]
            if section != prevsection or prevsection == '':
                decoder.encoding = ''  # re-detect encoding for every section
                prevsection = section
            try:
                result = rpc.get_result()
            except urlfetch.DownloadError as e:
                self.log.warn(str(e))
                continue
            except apiproxy_errors.DeadlineExceededError:
                self.log.warn('timeout:%s' % url)
                continue
            except Exception as e:
                self.log.warn('%s:%s' % (type(e), url))
                continue
            finally:
                # Whether this fetch succeeded or not, launch the next
                # queued request to keep MAX_ASYNC_REQUESTS in flight.
                if asyncurls:
                    rpc = urlfetch.create_rpc(deadline=self.timeout)
                    index, newurl = asyncurls.pop(0)
                    urlfetch.make_fetch_call(rpc, newurl, validate_certificate=False)
                    rpcs.append((index, rpc))
            status_code, content = result.status_code, result.content
            if status_code != 200 or not content:
                self.log.warn('async fetch article failed(%d):%s.' % (status_code, url))
                continue
            if self.page_encoding:
                article = content.decode(self.page_encoding)
            else:
                article = decoder.decode(content)
            # For image items, `title` holds the mime type.
            for title, imgurl, imgfn, content, brief in readability(article, url, opts):
                if title.startswith(r'image/'):  # image
                    yield (title, imgurl, imgfn, content, brief)
                else:
                    if not title:
                        title = ftitle
                    content = self.postprocess(content)
                    yield (section, url, title, content, brief)
    else:  # synchronous UrlFetch path
        for section, ftitle, url, desc in urls:
            if not desc:  # summary-only RSS
                if section != prevsection or prevsection == '':
                    decoder.encoding = ''  # re-detect encoding per section
                    prevsection = section
                article = self.fetcharticle(url, decoder)
                if not article:
                    continue
            else:
                article = self.FragToXhtml(desc, ftitle)
            # For image items, `title` holds the mime type.
            for title, imgurl, imgfn, content, brief in readability(article, url, opts):
                if title.startswith(r'image/'):  # image
                    yield (title, imgurl, imgfn, content, brief)
                else:
                    if not title:
                        title = ftitle
                    content = self.postprocess(content)
                    yield (section, url, title, content, brief)