def Items(self, opts=None, user=None):
    """Generator yielding one tuple per feed item.

    For HTML articles: (section, url, title, content, brief, thumbnail)
    For images: (mime, url, filename, content, brief, thumbnail)

    opts/user are passed through to the readability extractor.
    """
    urls = self.ParseFeedUrls()
    readability = self.readability if self.fulltext_by_readability else self.readability_by_soup
    prevsection = ''
    opener = URLOpener(self.host, timeout=self.timeout)
    decoder = AutoDecoder(False)
    for section, ftitle, url, desc in urls:
        if not desc:  # summary-only RSS: the article page must be fetched
            if section != prevsection or prevsection == '':
                decoder.encoding = ''  # re-detect encoding for every section
                prevsection = section
                # fresh connection (and login, if required) per section
                opener = URLOpener(self.host, timeout=self.timeout)
                if self.needs_subscription:
                    self.login(opener, decoder)
            article = self.fetcharticle(url, opener, decoder)
            if not article:
                continue
        else:  # full-text RSS: the feed already carries the article body
            article = self.FragToXhtml(desc, ftitle)

        # For image items the first tuple element (title) is the mime type.
        for title, imgurl, imgfn, content, brief, thumbnail in readability(article, url, opts, user, ftitle):
            if title.startswith(r'image/'):  # image
                yield (title, imgurl, imgfn, content, brief, thumbnail)
            else:
                # Use the feed-provided title when the page yields none, or
                # when the recipe forces the feed title. (Original performed
                # the same assignment in two consecutive ifs.)
                if not title or self.force_ftitle:
                    title = ftitle
                content = self.postprocess(content)
                yield (section, url, title, content, brief, thumbnail)
def Items(self, opts=None, user=None):
    """Generator yielding one tuple per feed item.

    For HTML articles: (section, url, title, content, brief)
    For images: (mime, url, filename, content, brief)
    """
    extract = self.readability if self.fulltext_by_readability else self.readability_by_soup
    dec = AutoDecoder(False)
    last_section = ''
    for section, feed_title, link, summary in self.ParseFeedUrls():
        if summary:  # full-text feed: body ships inside the feed itself
            article = self.FragToXhtml(summary, feed_title)
        else:  # summary feed: download the article page
            if section != last_section or last_section == '':
                dec.encoding = ''  # start every section with fresh encoding detection
                last_section = section
            article = self.fetcharticle(link, dec)
            if not article:
                continue
        # For image items the `title` slot carries the mime type.
        for title, imgurl, imgfn, content, brief in extract(article, link, opts, user):
            if title.startswith(r'image/'):
                yield (title, imgurl, imgfn, content, brief)
            else:
                content = self.postprocess(content)
                yield (section, link, title or feed_title, content, brief)
def Items(self, opts=None, user=None):
    """Generator over feed items.

    Yields (section, url, title, content, brief) for HTML articles and
    (mime, url, filename, content, brief) for images.
    """
    all_urls = self.ParseFeedUrls()
    if self.fulltext_by_readability:
        parse = self.readability
    else:
        parse = self.readability_by_soup
    decoder = AutoDecoder(False)
    prev = ''
    for section, ftitle, url, desc in all_urls:
        if not desc:  # summary-only entry: fetch the page itself
            if prev == '' or section != prev:
                decoder.encoding = ''  # re-detect encoding per section
                prev = section
            article = self.fetcharticle(url, decoder)
            if not article:
                continue
        else:  # the feed already contains the full article
            article = self.FragToXhtml(desc, ftitle)
        # Note: for image tuples `title` holds the mime type.
        for title, imgurl, imgfn, content, brief in parse(article, url, opts, user):
            if title.startswith(r'image/'):
                yield (title, imgurl, imgfn, content, brief)
                continue
            if not title:
                title = ftitle
            yield (section, url, title, self.postprocess(content), brief)
def Items(self, opts=None, user=None):
    """Generator over feed items.

    Yields (section, url, title, content, brief, thumbnail) for HTML
    articles and (mime, url, filename, content, brief, thumbnail) for
    images.
    """
    item_list = self.ParseFeedUrls()
    extractor = (self.readability if self.fulltext_by_readability
                 else self.readability_by_soup)
    last_sec = ''
    conn = URLOpener(self.host, timeout=self.timeout)
    decoder = AutoDecoder(False)
    for section, feed_title, link, summary in item_list:
        if summary:  # full-text feed: article body is embedded in the feed
            article = self.FragToXhtml(summary, feed_title)
        else:  # summary feed: the article page has to be downloaded
            if section != last_sec or last_sec == '':
                decoder.encoding = ''  # fresh encoding detection per section
                last_sec = section
                # new section: fresh connection, and log in when needed
                conn = URLOpener(self.host, timeout=self.timeout)
                if self.needs_subscription:
                    self.login(conn, decoder)
            article = self.fetcharticle(link, conn, decoder)
            if not article:
                continue
        # Image tuples carry the mime type in the `title` slot.
        for title, imgurl, imgfn, content, brief, thumbnail in extractor(article, link, opts, user):
            if title.startswith(r'image/'):
                yield (title, imgurl, imgfn, content, brief, thumbnail)
            else:
                # Feed title wins when the user asks for it, or when the
                # extractor produced no title at all.
                if (user and user.use_title_in_feed) or not title:
                    title = feed_title
                yield (section, link, title, self.postprocess(content), brief, thumbnail)
def Items(self):
    """Generator yielding one tuple per feed item.

    For HTML articles: (section, url, title, content, brief)
    For images: (mime, url, filename, content, brief)

    NOTE(review): Python 2 only -- uses `except E, e` syntax and the
    identifier `async`, which became a keyword in Python 3.7.
    """
    urls = self.ParseFeedUrls()
    readability = self.readability if self.fulltext_by_readability else self.readability_by_soup
    prevsection = ''
    decoder = AutoDecoder()
    if USE_ASYNC_URLFETCH:
        async = AsyncURLFetchManager()
        # Start asynchronous fetches for the summary-only (non full-text) articles.
        rpcs = {url: async.fetch_async(url) for _a, _b, url, desc in urls if not desc}
        # For efficiency, process full-text RSS entries first: their bodies
        # are already in the feed, and the async fetches run meanwhile.
        for section, ftitle, url, desc in urls:
            if not desc:
                continue
            article = self.FragToXhtml(desc, ftitle)
            # For image items, `title` holds the mime type.
            for title, imgurl, imgfn, content, brief in readability(article, url):
                if title.startswith(r'image/'):  # image
                    yield (title, imgurl, imgfn, content, brief)
                else:
                    if not title:
                        title = ftitle
                    content = self.postprocess(content)
                    yield (section, url, title, content, brief)
        # Now the summary-only RSS entries, collecting the async results.
        for section, ftitle, url, desc in urls:
            if desc:
                continue
            if section != prevsection or prevsection == '':
                decoder.encoding = ''  # re-detect encoding for every section
                prevsection = section
            try:
                resp = rpcs[url].get_result()
            except urlfetch.DownloadError, e:
                self.log.warn(str(e))
                continue
            except Exception, e:
                self.log.warn('%s:%s.' % (str(e), url))
                continue
            status_code, content = resp.status_code, resp.content
            if status_code != 200 or not content:
                self.log.warn('async fetch article failed(%d):%s.' % (status_code, url))
                continue
            if self.page_encoding:
                article = content.decode(self.page_encoding)
            else:
                article = decoder.decode(content)
            # For image items, `title` holds the mime type.
            for title, imgurl, imgfn, content, brief in readability(article, url):
                if title.startswith(r'image/'):  # image
                    yield (title, imgurl, imgfn, content, brief)
                else:
                    if not title:
                        title = ftitle
                    content = self.postprocess(content)
                    yield (section, url, title, content, brief)
                    # NOTE(review): the next yield appears duplicated at the
                    # chunk seam (identical tuple yielded twice in the
                    # original text) -- likely an extraction artifact;
                    # verify against the upstream revision.
                    yield (section, url, title, content, brief)
    else:  # synchronous UrlFetch path
        for section, ftitle, url, desc in urls:
            if not desc:  # summary-only RSS
                if section != prevsection or prevsection == '':
                    decoder.encoding = ''  # re-detect encoding per section
                    prevsection = section
                article = self.fetcharticle(url, decoder)
                if not article:
                    continue
            else:
                article = self.FragToXhtml(desc, ftitle)
            # For image items, `title` holds the mime type.
            for title, imgurl, imgfn, content, brief in readability(article, url):
                if title.startswith(r'image/'):  # image
                    yield (title, imgurl, imgfn, content, brief)
                else:
                    if not title:
                        title = ftitle
                    content = self.postprocess(content)
                    yield (section, url, title, content, brief)

def fetcharticle(self, url, decoder):
    """Fetch one article synchronously.

    NOTE(review): this definition is truncated in the visible chunk --
    the body ends right after the open() call; the remainder (decode and
    return) is outside this view.
    """
    # Route through Instapaper's mobilizer when full text is wanted but
    # the local readability extractor is not in use.
    if self.fulltext_by_instapaper and not self.fulltext_by_readability:
        url = "http://www.instapaper.com/m?u=%s" % self.url_unescape(url)
    opener = URLOpener(self.host)
    result = opener.open(url)
def Items(self, opts=None):
    """Generator over feed items.

    HTML articles: (section, url, title, content, brief)
    Images: (mime, url, filename, content, brief)
    """
    urls = self.ParseFeedUrls()
    extract = self.readability if self.fulltext_by_readability else self.readability_by_soup
    decoder = AutoDecoder(False)
    prev_sec = ''

    def emit(article, url, section, ftitle):
        # Run the extractor and yield the tuple of the proper shape; for
        # image items the first element is the mime type.
        for title, imgurl, imgfn, content, brief in extract(article, url, opts):
            if title.startswith(r'image/'):
                yield (title, imgurl, imgfn, content, brief)
            else:
                if not title:
                    title = ftitle
                yield (section, url, title, self.postprocess(content), brief)

    if USE_ASYNC_URLFETCH:
        # Kick off background downloads for every summary-only entry.
        asyncopener = AsyncURLOpener(self.log)
        rpcs = [asyncopener.fetch(url, self.timeout, sec, title)
                for sec, title, url, desc in urls if not desc]
        # Full-text entries first -- their pages need no download, and the
        # other articles keep downloading in the background meanwhile.
        for section, ftitle, url, desc in urls:
            if not desc:
                continue
            for item in emit(self.FragToXhtml(desc, ftitle), url, section, ftitle):
                yield item
        # Then the summary-only entries, as their downloads complete.
        for result, url, (section, ftitle) in asyncopener.get_result():
            if prev_sec == '' or section != prev_sec:
                decoder.encoding = ''  # re-detect encoding per section
                prev_sec = section
            status_code, content = result.status_code, result.content
            if status_code != 200 or not content:
                self.log.warn('async fetch article failed(%d):%s.' % (status_code, url))
                continue
            if self.page_encoding:
                article = content.decode(self.page_encoding)
            else:
                article = decoder.decode(content, url)
            for item in emit(article, url, section, ftitle):
                yield item
    else:  # synchronous UrlFetch path
        for section, ftitle, url, desc in urls:
            if not desc:  # summary-only RSS: fetch the article page
                if prev_sec == '' or section != prev_sec:
                    decoder.encoding = ''
                    prev_sec = section
                article = self.fetcharticle(url, decoder)
                if not article:
                    continue
            else:  # full-text RSS
                article = self.FragToXhtml(desc, ftitle)
            for item in emit(article, url, section, ftitle):
                yield item
def Items(self, opts=None):
    """Yield feed items one at a time.

    HTML article tuples: (section, url, title, content, brief)
    Image tuples: (mime, url, filename, content, brief)
    """
    feed_entries = self.ParseFeedUrls()
    if self.fulltext_by_readability:
        extractor = self.readability
    else:
        extractor = self.readability_by_soup
    dec = AutoDecoder(False)
    last = ''
    if USE_ASYNC_URLFETCH:
        # Fire off background fetches for every summary-only entry.
        fetcher = AsyncURLOpener(self.log)
        pending = [fetcher.fetch(u, self.timeout, s, t)
                   for s, t, u, d in feed_entries if not d]
        # Serve the full-text entries first while the rest download.
        for sec, ftitle, link, frag in feed_entries:
            if not frag:
                continue
            doc = self.FragToXhtml(frag, ftitle)
            for title, imgurl, imgfn, body, brief in extractor(doc, link, opts):
                if title.startswith(r'image/'):  # mime type in the title slot
                    yield (title, imgurl, imgfn, body, brief)
                    continue
                yield (sec, link, title if title else ftitle, self.postprocess(body), brief)
        # Then drain the async results for the summary-only entries.
        for result, link, (sec, ftitle) in fetcher.get_result():
            if sec != last or last == '':
                dec.encoding = ''  # re-detect charset per section
                last = sec
            code, raw = result.status_code, result.content
            if code != 200 or not raw:
                self.log.warn('async fetch article failed(%d):%s.' % (code, link))
                continue
            doc = raw.decode(self.page_encoding) if self.page_encoding else dec.decode(raw, link)
            for title, imgurl, imgfn, body, brief in extractor(doc, link, opts):
                if title.startswith(r'image/'):
                    yield (title, imgurl, imgfn, body, brief)
                    continue
                yield (sec, link, title if title else ftitle, self.postprocess(body), brief)
    else:
        # Plain synchronous path.
        for sec, ftitle, link, frag in feed_entries:
            if frag:  # full-text feed entry
                doc = self.FragToXhtml(frag, ftitle)
            else:  # summary-only: download the page
                if sec != last or last == '':
                    dec.encoding = ''
                    last = sec
                doc = self.fetcharticle(link, dec)
                if not doc:
                    continue
            for title, imgurl, imgfn, body, brief in extractor(doc, link, opts):
                if title.startswith(r'image/'):
                    yield (title, imgurl, imgfn, body, brief)
                    continue
                yield (sec, link, title if title else ftitle, self.postprocess(body), brief)
def Items(self, opts=None):
    """Generator yielding one tuple per feed item.

    For HTML articles: (section, url, title, content)
    For images: (mime, url, filename, content)
    (The yielded tuples actually include a trailing `brief` element.)
    """
    urls = self.ParseFeedUrls()
    readability = self.readability if self.fulltext_by_readability else self.readability_by_soup
    prevsection = ''
    decoder = AutoDecoder()
    if USE_ASYNC_URLFETCH:
        asyncurls = [(i, url) for i, (_a, _b, url, desc) in enumerate(urls) if not desc]
        rpcs = []
        # Start up to MAX_ASYNC_REQUESTS initial fetches.
        # BUGFIX: the original looped `while i < min(MAX_ASYNC_REQUESTS,
        # len(asyncurls))` while popping from asyncurls, so the bound shrank
        # each iteration and only about half the intended requests started.
        # Compute the count once, before any pop.
        for _ in range(min(MAX_ASYNC_REQUESTS, len(asyncurls))):
            rpc = urlfetch.create_rpc(deadline=self.timeout)
            index, url = asyncurls.pop(0)
            urlfetch.make_fetch_call(rpc, url, validate_certificate=False)
            rpcs.append((index, rpc))
        # For efficiency, process full-text RSS entries first: their bodies
        # are already in the feed, and the async fetches run meanwhile.
        for section, ftitle, url, desc in urls:
            if not desc:
                continue
            article = self.FragToXhtml(desc, ftitle)
            # For image items, `title` holds the mime type.
            for title, imgurl, imgfn, content, brief in readability(article, url, opts):
                if title.startswith(r'image/'):  # image
                    yield (title, imgurl, imgfn, content, brief)
                else:
                    if not title:
                        title = ftitle
                    content = self.postprocess(content)
                    yield (section, url, title, content, brief)
        # Now the summary-only RSS entries, draining the rpc queue.
        while rpcs:
            index, rpc = rpcs.pop(0)
            section, ftitle, url, desc = urls[index]
            if section != prevsection or prevsection == '':
                decoder.encoding = ''  # re-detect encoding for every section
                prevsection = section
            try:
                result = rpc.get_result()
            except urlfetch.DownloadError as e:
                self.log.warn(str(e))
                continue
            except apiproxy_errors.DeadlineExceededError:
                self.log.warn('timeout:%s' % url)
                continue
            except Exception as e:
                self.log.warn('%s:%s' % (type(e), url))
                continue
            finally:
                # Whether this fetch succeeded or not, launch the next
                # queued request to keep MAX_ASYNC_REQUESTS in flight.
                if asyncurls:
                    rpc = urlfetch.create_rpc(deadline=self.timeout)
                    index, newurl = asyncurls.pop(0)
                    urlfetch.make_fetch_call(rpc, newurl, validate_certificate=False)
                    rpcs.append((index, rpc))
            status_code, content = result.status_code, result.content
            if status_code != 200 or not content:
                self.log.warn('async fetch article failed(%d):%s.' % (status_code, url))
                continue
            if self.page_encoding:
                article = content.decode(self.page_encoding)
            else:
                article = decoder.decode(content)
            # For image items, `title` holds the mime type.
            for title, imgurl, imgfn, content, brief in readability(article, url, opts):
                if title.startswith(r'image/'):  # image
                    yield (title, imgurl, imgfn, content, brief)
                else:
                    if not title:
                        title = ftitle
                    content = self.postprocess(content)
                    yield (section, url, title, content, brief)
    else:  # synchronous UrlFetch path
        for section, ftitle, url, desc in urls:
            if not desc:  # summary-only RSS
                if section != prevsection or prevsection == '':
                    decoder.encoding = ''  # re-detect encoding per section
                    prevsection = section
                article = self.fetcharticle(url, decoder)
                if not article:
                    continue
            else:
                article = self.FragToXhtml(desc, ftitle)
            # For image items, `title` holds the mime type.
            for title, imgurl, imgfn, content, brief in readability(article, url, opts):
                if title.startswith(r'image/'):  # image
                    yield (title, imgurl, imgfn, content, brief)
                else:
                    if not title:
                        title = ftitle
                    content = self.postprocess(content)
                    yield (section, url, title, content, brief)