Example No. 1
    def POST(self):
        user = self.getcurrentuser(forAjax=True)
        web.header('Content-Type', 'application/json')
        webInput = web.input()
        category = webInput.get('category', '')
        title = webInput.get('title')
        feedUrl = webInput.get("url")
        isfulltext = bool(webInput.get('isfulltext', '').lower() == 'true')
        creator = webInput.get('creator', '')

        if not title or not feedUrl:
            return json.dumps({'status': _("Title or Url is empty!")})

        opener = URLOpener()
        srvUrl = urlparse.urljoin('http://kindleear.appspot.com/',
                                  SharedLibrarykindleearAppspotCom.__url__)
        data = {
            'category': category,
            'title': title,
            'url': feedUrl,
            'creator': creator,
            'isfulltext': 'true' if isfulltext else 'false',
            'key': 'kindleear.lucky!'
        }
        result = opener.open(srvUrl, data)
        if result.status_code == 200 and result.content:
            return result.content
        else:
            return json.dumps({
                'status':
                _('Cannot submit data to kindleear.appspot.com, status: %s' %
                  URLOpener.CodeMap(result.status_code))
            })
Example No. 2
    def ParsePageContent(self, topic, url, urls, count):
        # Request the topic page and fetch its content
        result = self.GetResponseContent(url)
        # If the request succeeded and the page content is not empty
        if result.status_code == 200 and result.content:
            # Convert the page content into a BeautifulSoup object
            soup = BeautifulSoup(result.content, 'lxml')
            # Find all article entries in the article list of the current page
            items = soup.find_all(name='span', class_='tw3_01_2_t')

            # Process each article entry in turn
            for item in items:
                title = item.a.string  # get the article title
                link = item.a.get('href')  # get the article link
                link = BaseFeedBook.urljoin(url, link)  # build the absolute article link
                count += 1  # count the article entries processed so far
                # Stop extracting once the processed entries exceed the configured limit
                if count > self.max_articles_per_feed:
                    break
                # Skip articles whose publication date falls outside the configured time range
                if self.OutTimeRange(item):
                    continue
                # Append the article that satisfies the count and time-range limits to the list as a tuple
                urls.append((topic, title, link, None))

            # If the topic page has a next page and the article count is still under the limit, fetch the next page
            next = soup.find(name='a', string='Next')
            if next and count < self.max_articles_per_feed:
                url = BaseFeedBook.urljoin(url, next.get('href'))
                self.ParsePageContent(topic, url, urls, count)
        # If the request failed, log it
        else:
            self.log.warn('Fetch article failed(%s):%s' % \
                (URLOpener.CodeMap(result.status_code), url))
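A minimal, self-contained sketch of the same scraping pattern, run on an inline HTML fragment instead of a live page (only the class name tw3_01_2_t is taken from the example above; the markup and topic name are illustrative):

    from bs4 import BeautifulSoup

    html = ('<span class="tw3_01_2_t"><a href="/articles/1">First</a></span>'
            '<span class="tw3_01_2_t"><a href="/articles/2">Second</a></span>')
    soup = BeautifulSoup(html, 'html.parser')
    urls = []
    for item in soup.find_all('span', class_='tw3_01_2_t'):
        # Collect (topic, title, link, description) tuples the way ParsePageContent does
        urls.append(('Topic', item.a.string, item.a.get('href'), None))
    print(urls)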
Example No. 3
    def ParseFeedUrls(self):
        urls = []  # empty list that will hold the article tuples
        # Process each topic page in self.feeds
        for feed in self.feeds:
            # Get the topic name and link from the tuple
            topic, url = feed[0], feed[1]
            # Request the topic link and fetch its content
            opener = URLOpener(self.host, timeout=self.timeout)
            result = opener.open(url)
            # If the request succeeded and the page content is not empty
            if result.status_code == 200 and result.content:
                # Convert the page content into a BeautifulSoup object
                soup = BeautifulSoup(result.content, 'lxml')
                # Find all article entries in the article list of the current page
                items = soup.find('div', class_='grid').find_all(name='div', class_='content')
                # Process each article entry in turn
                for item in items:
                    title = item.span.string  # get the article title
                    link = item.a.get('href')  # get the article link
                    link = BaseFeedBook.urljoin(url, link)  # build the absolute article link
                    # Skip articles whose publication date falls outside the configured time range
                    if self.OutTimeRange(item):
                        continue
                    urls.append((topic, title, link, None))  # append the article tuple to the list
            # If the request failed, log it
            else:
                self.log.warn('Fetch article failed(%s):%s' % \
                    (URLOpener.CodeMap(result.status_code), url))
        # Return the list of all extracted articles
        return urls
Example No. 4
    def GetNewComic(self):
        urls = []

        if not self.feeds:
            return []

        userName = self.UserName()
        decoder = AutoDecoder(isfeed=False)
        for item in self.feeds:
            title, url = item[0], item[1]

            lastCount = LastDelivered.all().filter(
                'username = ', userName).filter('bookname = ', title).get()
            if not lastCount:
                self.log.info(
                    'These is no log in db LastDelivered for name: %s, set to 0'
                    % title)
                oldNum = 0
            else:
                oldNum = lastCount.num

            opener = URLOpener(self.host, timeout=60)
            result = opener.open(url)
            if result.status_code != 200:
                self.log.warn(
                    'fetch index page for %s failed[%s] : %s' %
                    (title, URLOpener.CodeMap(result.status_code), url))
                continue
            content = result.content
            content = self.AutoDecodeContent(content, decoder,
                                             self.feed_encoding,
                                             opener.realurl, result.headers)

            soup = BeautifulSoup(content, 'lxml')

            allComicTable = soup.find_all('table', {'width': '688'})
            addedForThisComic = False
            for comicTable in allComicTable:
                comicVolumes = comicTable.find_all('a', {'target': '_blank'})
                for volume in comicVolumes:
                    texts = volume.text.split(' ')
                    if len(texts) > 2 and texts[1].isdigit() and volume.get(
                            'href'):
                        num = int(texts[1])
                        if num > oldNum:
                            oldNum = num
                            href = self.urljoin(self.host, volume.get('href'))
                            urls.append((title, num, href))
                            addedForThisComic = True
                            break  #push only one volume at a time (a single volume can already contain a lot of images)

                if addedForThisComic:
                    break

        return urls
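The volume check in GetNewComic above splits the link text on spaces and reads the second field as the volume number. A hedged stand-alone helper isolating that check (parse_volume_number is illustrative, not part of the original class):

    def parse_volume_number(link_text):
        # Mirror the split/isdigit test above: u'Vol 12 (end)' -> 12, otherwise None
        parts = link_text.split(' ')
        if len(parts) > 2 and parts[1].isdigit():
            return int(parts[1])
        return None

    print(parse_volume_number(u'Vol 12 (end)'))  # 12
    print(parse_volume_number(u'Special'))       # None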
Example No. 5
    def ParseFeedUrls(self):
        urls = []  # empty list that will hold the article tuples
        # Process each topic page in self.feeds
        for feed in self.feeds:
            # Get the topic name and link from the tuple
            topic, url = feed[0], feed[1]
            # Request the topic link and fetch its content
            opener = URLOpener(self.host, timeout=self.timeout)
            result = opener.open(url)
            # If the request succeeded and the page content is not empty
            if result.status_code == 200 and result.content:
                # Convert the page content into a BeautifulSoup object
                soup = BeautifulSoup(result.content, 'html.parser')
                # Find all article entries in the article list of the current page
                sections = soup.find_all(name='div', class_='column-news')
                # self.log.warn('find %d sections' % len(sections))
                for section in sections:
                    tag = section.find(name='ul', class_='column-title')
                    sectionName = tag.a.li.string
                    tuwens = section.find_all(
                        name='div', class_=re.compile("tuwen-block-"))
                    # self.log.warn('%s find %d tuwen' % (sectionName, len(tuwens)))
                    for tuwen in tuwens:
                        articles = tuwen.find_all('a')
                        title = ''
                        link = ''
                        for article in articles:
                            if not article.img:
                                title = article.string
                                link = article.get('href')  # get the article link
                                self.log.warn('title : %s, link: %s' %
                                              (title, link))
                                break
                        urls.append(
                            (sectionName, title, link, None))  # append the article tuple to the list
                    texts = section.find_all(name='li',
                                             class_=re.compile("list-text-"))
                    # self.log.warn('%s find %d texts' % (sectionName, len(texts)))
                    for text in texts:
                        title = text.a.string
                        link = text.a.get('href')  # get the article link
                        self.log.warn('title : %s, link: %s' % (title, link))
                        urls.append(
                            (sectionName, title, link, None))  # append the article tuple to the list

            # If the request failed, log it
            else:
                self.log.warn('Fetch article failed(%s):%s' %
                              (URLOpener.CodeMap(result.status_code), url))
        # Return the list of all extracted articles
        return urls
Example No. 6
    def ParsePageContent(self, topic, url, urls, count):
        # Request the topic page and fetch its content
        result = self.GetResponseContent(url)
        # If the request succeeded and the page content is not empty
        if result.status_code == 200 and result.content:
            # Convert the page content into a BeautifulSoup object
            soup = BeautifulSoup(result.content, 'lxml')
            # Find all article entries in the article list of the current page
            items = soup.find_all(name='div', class_='col-md-12 border')
            #topics = soup.find_all(name='small', class_='text-muted')

            # Process each article entry in turn
            for item in items:
                title = item.div.h4.em.string  # get the article title
                link = item.a.get('onclick').split("'")  # get the pieces of the article link
                link = 'https://trends.lenovoresearch.cn/tst/article/article-detail/' + "?article_id=" + link[
                    1] + "&sections=" + link[3] + "&dates=" + link[
                        5] + "&sort=" + link[7] + "&search=" + link[
                            9] + "&page=" + link[11] + "&web_source=" + topic
                #self.log.warn(item.find_all(name='em')[1].string)
                group = item.find_all(name='em')[1].string
                #topic = topics[2*count - 1].string
                # Stop extracting once the processed entries exceed the configured limit
                if count > self.max_articles_per_feed:
                    break
                # Skip articles whose publication date falls outside the configured time range
                if self.OutTimeRange(item):
                    break
                # Skip articles whose issue falls outside the configured range
                if self.OutIssue(item):
                    #self.log.warn(self.issue_number)
                    continue
                count += 1  # count the article entries processed so far
                # Append the article that satisfies the count and time-range limits to the list as a tuple
                urls.append((group, title, link, None))

            # If the topic page has a next page and the article count is still under the limit, fetch the next page
            next = soup.find_all(name='li', class_='page-item')
            #self.log.warn(next)
            if next[-1].span and count < self.max_articles_per_feed:
                #self.log.warn(temp)
                link = next[-1].a.get("href").replace(" ", "%20")
                links = 'https://trends.lenovoresearch.cn/tst/article/article-list/' + link
                #self.log.warn(links)
                self.ParsePageContent(topic, links, urls, count)
        # If the request failed, log it
        else:
            self.log.warn('Fetch article failed(%s):%s' % \
                (URLOpener.CodeMap(result.status_code), url))
Example No. 7
    def ParseFeedUrls(self):
        urls = []
        userName = self.UserName()
        decoder = AutoDecoder(isfeed=False)

        lastCount = LastDelivered.all().filter('username = ', userName).filter(
            'bookname = ', self.title).get()
        if not lastCount:
            oldNum = 0
            oldChapterTitle = ''
        else:
            oldNum = lastCount.num
            oldChapterTitle = lastCount.record

        opener = URLOpener(self.host, timeout=60)
        result = opener.open(self.feeds)
        if result.status_code != 200:
            self.log.warn('fetch index page for %s failed[%s] : %s' %
                          (self.title, URLOpener.CodeMap(
                              result.status_code), self.feeds))
            return []

        # Get the chapter list from the page
        content = self.AutoDecodeContent(result.content, decoder,
                                         self.feed_encoding, opener.realurl,
                                         result.headers)
        soup = BeautifulSoup(content, 'lxml')
        chapterList = self.GetChapterList(soup)

        chapterNum = 0
        for chapter in chapterList:
            if chapterNum >= self.limit:
                break
            url = chapter.get('href')
            num = self.GetChapterNum(url)
            if num > oldNum:
                oldNum = num
                oldChapterTitle = chapter.text
                chapterNum += 1
                urls.append(
                    (self.title, oldChapterTitle, self.urljoin(self.host,
                                                               url), ''))

        self.UpdateLastDelivered(self.title, oldNum, oldChapterTitle)
        return urls
Example No. 8
    def fetch(self, url, opener, decoder):
        """链接网络,下载网页并解码"""
        result = opener.open(url)
        status_code, content = result.status_code, result.content
        if status_code not in (200, 206) or not content:
            self.log.warn('fetch page failed(%s):%s.' %
                          (URLOpener.CodeMap(status_code), url))
            return None

        #debug_mail(content)

        if self.page_encoding:
            try:
                return content.decode(self.page_encoding)
            except UnicodeDecodeError:
                return decoder.decode(content, opener.realurl, result.headers)
        else:
            return decoder.decode(content, opener.realurl, result.headers)
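The fetch helper above tries the configured page_encoding first and falls back to an AutoDecoder. A rough sketch of that decode-then-fallback idea using only the standard library (the candidate encodings here are an assumption; the real AutoDecoder also inspects HTTP headers and meta tags):

    def decode_with_fallback(raw, preferred=None, candidates=('utf-8', 'gb18030')):
        # Try the preferred encoding, then the candidates, then a lossy decode.
        encodings = ([preferred] if preferred else []) + list(candidates)
        for enc in encodings:
            try:
                return raw.decode(enc)
            except (UnicodeDecodeError, LookupError):
                continue
        return raw.decode('utf-8', 'replace')

    print(decode_with_fallback(b'hello world'))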
Example No. 9
    def GET(self):
        user = self.getcurrentuser(forAjax=True)
        web.header('Content-Type', 'application/json')

        #connect to the shared-library server to fetch data
        respDict = {'status': 'ok', 'categories': []}

        opener = URLOpener()
        url = urlparse.urljoin(
            'http://kindleear.appspot.com/',
            SharedLibraryCategorykindleearAppspotCom.__url__)
        result = opener.open(url + '?key=kindleear.lucky!')

        if result.status_code == 200 and result.content:
            respDict['categories'] = json.loads(result.content)
        else:
            respDict['status'] = _(
                'Cannot fetch data from kindleear.appspot.com, status: '
            ) + URLOpener.CodeMap(result.status_code)

        return json.dumps(respDict)
Example No. 10
    def ParsePageLinks(self, topic, url, urls, count, count2, ccc):
        # Request the topic page or chapter-list page and fetch its content
        result = self.GetResponseContent(url)
        # If the request succeeded and the page content is not empty
        if result.status_code == 200 and result.content:
            # Convert the topic or list page content into a BeautifulSoup object
            soup = BeautifulSoup(result.content, 'lxml')
            # Find all article entries in the list on the current page; the tag parameters need to be adjusted by hand
            items = soup.find_all(name='dd')
            # Count the total number of chapters so the newest ones can be grabbed to follow updates
            for ttt in items:
                ccc += 1

            # Process each article entry in turn
            for item in items:
                title = item.a.string  # get the article title
                link = item.a.get('href')  # get the article link
                link = BaseFeedBook.urljoin(url, link)  # build the absolute article link
                count += 1  # count the article entries processed so far

                # Stop once the processed entries exceed the configured limit; change the conditions
                # below to choose a grabbing mode, or comment both out to grab everything
                count2 = count + self.max_articles_per_feed
                if count2 < ccc:  # mode 1: grab the last n chapters
                    continue

                #if count > self.max_articles_per_feed:                                      # mode 2: grab the first n chapters
                #    break

                # Append the article that satisfies the configured count to the list as a tuple
                urls.append((topic, title, link, None))

            # If the topic page has a next page and the count is still under the limit, recurse into the next page
            #next = soup.find(name='a', string='Next')
            #if next and count < self.max_articles_per_feed:
            #url = BaseFeedBook.urljoin(url, next.get('href'))
            #self.ParsePageLinks(topic, url, urls, count)
        # If the request failed, log it
        else:
            self.log.warn('Fetch article failed(%s):%s' % \
                (URLOpener.CodeMap(result.status_code), url))
Example No. 11
    def GET(self):
        user = self.getcurrentuser()

        #connect to the shared-library server to fetch data
        shared_data = []
        tips = ''
        opener = URLOpener()
        url = urlparse.urljoin('http://kindleear.appspot.com/',
                               SharedLibrarykindleearAppspotCom.__url__)
        result = opener.open(url + '?key=kindleear.lucky!')
        if result.status_code == 200 and result.content:
            shared_data = json.loads(result.content)
        else:
            tips = _('Cannot fetch data from kindleear.appspot.com, status: '
                     ) + URLOpener.CodeMap(result.status_code)

        return self.render('sharedlibrary.html',
                           "Shared",
                           current='shared',
                           user=user,
                           shared_data=shared_data,
                           tips=tips)
Example No. 12
    def ParseFeedUrls(self):
        urls = []  # empty list that will hold the article tuples
        # Process each topic page in self.feeds
        for feed in self.feeds:
            # Get the topic name and link from the tuple
            topic, url = feed[0], feed[1]
            # Request the topic link and fetch its content
            opener = URLOpener(self.host, timeout=self.timeout)
            result = opener.open(url)
            # If the request succeeded and the page content is not empty
            if result.status_code == 200 and result.content:
                # Convert the page content into a BeautifulSoup object
                soup = BeautifulSoup(result.content, 'html.parser')
                # self.log.warn('title : %s' % soup.title)
                # Find all article entries in the article list of the current page
                items = soup.find_all(name='div', class_="content")
                self.log.warn('find : %d articles.' % len(items))
                # Process each article entry in turn
                count = 0
                for item in items:
                    title = item.a.string  # get the article title
                    link = item.a.get('href')  # get the article link
                    link = BaseFeedBook.urljoin("https://toutiao.io", link)  # build the absolute article link
                    link = self.getRealUrl(link)
                    self.log.warn('Fetch article : %s' % link)
                    if string.find(link, 'zhihu.com') != -1:
                        link = self.url4forwarder(link)  # fetch zhihu.com articles through the forwarder
                        self.log.warn('transport : %s' % link)
                    urls.append((topic, title, link, None))  # append the article tuple to the list
                    count = count + 1
                    if count >= 30:
                        break
            # If the request failed, log it
            else:
                self.log.warn('Fetch article failed(%s):%s' % \
                    (URLOpener.CodeMap(result.status_code), url))
        # Return the list of all extracted articles
        return urls
Example No. 13
    def ParseFeedUrls(self):
        urls = []  # empty list that will hold the article tuples
        # Process each topic page in self.feeds
        for feed in self.feeds:
            # Get the topic name and link from the tuple
            topic, url = feed[0], feed[1]
            # Request the topic link and fetch its content
            opener = URLOpener(self.host, timeout=self.timeout)
            result = opener.open(url)
            # If the request succeeded and the page content is not empty
            if result.status_code == 200 and result.content:
                # Convert the page content into a BeautifulSoup object
                soup = BeautifulSoup(result.content, 'html.parser')
                # Find the first entry in the chapter list of the current page
                item = soup.find(name='dd')
                count = 0
                while item:
                    # Only grab the most recently updated chapters
                    if item.name != 'dd':
                        break
                    title = item.a.string  # get the chapter title
                    link = item.a.get('href')  # get the chapter link
                    link = BaseFeedBook.urljoin(
                        "https://www.72wx.com", link)  # build the absolute chapter link
                    urls.insert(0, (topic, title, link, None))  # prepend the article tuple to the list
                    count = count + 1
                    if count >= 20:
                        break
                    item = item.next_sibling
                    while type(item) != element.Tag:
                        item = item.next_sibling
            # If the request failed, log it
            else:
                self.log.warn('Fetch article failed(%s):%s' %
                              (URLOpener.CodeMap(result.status_code), url))
        # Return the list of all extracted articles
        return urls
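Example 13 walks item.next_sibling manually and skips whitespace text nodes in an inner loop. BeautifulSoup's find_next_sibling('dd') does the same traversal in one call; a small sketch on an inline fragment (the markup is illustrative):

    from bs4 import BeautifulSoup

    html = '<dl><dd><a href="/c/3">Chapter 3</a></dd> <dd><a href="/c/2">Chapter 2</a></dd></dl>'
    item = BeautifulSoup(html, 'html.parser').find('dd')
    while item:
        print('%s %s' % (item.a.string, item.a.get('href')))
        item = item.find_next_sibling('dd')  # skips the whitespace between <dd> tags automatically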
Example No. 14
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        urls = []
        urladded = set()
        url = self.url4forwarder(self.feeds[0][1])
        opener = URLOpener(self.host, timeout=self.timeout)
        result = opener.open(url)
        if result.status_code == 200 and result.content:
            feed = json.loads(result.content.decode(self.feed_encoding))

            for partition, section in self.partitions:
                for item in feed[partition]:
                    urlfeed = item['share_url']
                    if urlfeed in urladded:
                        self.log.info('duplicated, skipped %s' % urlfeed)
                        continue

                    urls.append((section, item['title'],
                                 self.url4forwarder(urlfeed), None))
                    urladded.add(urlfeed)
        else:
            self.log.warn('fetch rss failed(%s):%s' %
                          (URLOpener.CodeMap(result.status_code), url))
        return urls
Example No. 15
    def POST(self, mgrType):
        user = self.getcurrentuser(forAjax=True)
        if mgrType == 'reportinvalid':  #report that a feed is no longer working
            web.header('Content-Type', 'application/json')
            title = web.input().get('title', '')
            feedUrl = web.input().get('url', '')

            opener = URLOpener()
            path = SharedLibraryMgrkindleearAppspotCom.__url__.split('/')
            path[-1] = mgrType
            srvUrl = urlparse.urljoin('http://kindleear.appspot.com/',
                                      '/'.join(path))
            data = {'title': title, 'url': feedUrl, 'key': 'kindleear.lucky!'}
            result = opener.open(srvUrl, data)
            if result.status_code == 200 and result.content:
                return result.content
            else:
                return json.dumps({
                    'status':
                    _('Cannot fetch data from kindleear.appspot.com, status: ')
                    + URLOpener.CodeMap(result.status_code)
                })
        else:
            return json.dumps({'status': 'unknown command: %s' % mgrType})
Example No. 16
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        urls = []
        tnow = datetime.datetime.utcnow()
        urladded = set()

        for feed in self.feeds:
            section, url = feed[0], feed[1]
            isfulltext = feed[2] if len(feed) > 2 else False
            timeout = self.timeout+10 if isfulltext else self.timeout
            opener = URLOpener(self.host, timeout=timeout)

            id = urlparse.urlparse(url).query.split('=')[1]

            result = opener.open(url)
            if result.status_code == 200 and result.content:
                if self.feed_encoding:
                    try:
                        content = result.content.decode(self.feed_encoding)
                    except UnicodeDecodeError:
                        content = AutoDecoder(True).decode(result.content,opener.realurl,result.headers)
                else:
                    content = AutoDecoder(True).decode(result.content,opener.realurl,result.headers)
            else:
                self.log.warn('fetch rss failed(%s):%s' % (URLOpener.CodeMap(result.status_code), url))
                continue
            
            eqs, ekv = process_eqs(content)
            url = WEIXIN_URL.format(id=id, eqs=urllib.quote(eqs), ekv=ekv, t=int(time.time()*1000))

            result = opener.open(url)
            if result.status_code == 200 and result.content:
                if self.feed_encoding:
                    try:
                        content = result.content.decode(self.feed_encoding)
                    except UnicodeDecodeError:
                        content = AutoDecoder(True).decode(result.content,opener.realurl,result.headers)
                else:
                    content = AutoDecoder(True).decode(result.content,opener.realurl,result.headers)
                content = content[content.find('{'):content.rfind('}')+1]
                try:
                    content = json.loads(content)
                except ValueError:
                    continue

                for e in content['items'][:self.max_articles_per_feed]:
                    e = feedparser.parse(e)['entries'][0]
                    updated = None
                    if hasattr(e, 'lastmodified') and e.lastmodified:
                        updated = float(e.lastmodified)

                    if self.oldest_article > 0 and updated:
                        updated = datetime.datetime.utcfromtimestamp(updated)
                        delta = tnow - updated
                        if self.oldest_article > 365:
                            threshold = self.oldest_article #value is in seconds
                        else:
                            threshold = 86400*self.oldest_article #value is in days

                        if delta.days*86400+delta.seconds > threshold:
                            self.log.info("Skip old article(%s): %s" % (updated.strftime('%Y-%m-%d %H:%M:%S'),e.href))
                            continue

                    #support HTTPS
                    if hasattr(e, 'href'):
                        if url.startswith('https://'):
                            urlfeed = e.href.replace('http://','https://')
                        else:
                            urlfeed = e.href

                        if urlfeed in urladded:
                            continue
                    else:
                        urlfeed = ''

                    desc = None
                    urls.append((section, e.title, urlfeed, desc))
                    urladded.add(urlfeed)
            else:
                self.log.warn('fetch rss failed(%s):%s' % (URLOpener.CodeMap(result.status_code), url))

        return urls
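Both this example and Example 19 treat oldest_article values above 365 as seconds and smaller values as days when deciding whether to skip an entry. A hedged helper isolating that check (is_too_old is illustrative and not part of the original classes):

    import datetime

    def is_too_old(updated, oldest_article, now=None):
        # oldest_article > 365 is read as seconds, otherwise as days,
        # matching the threshold logic in the examples above.
        if oldest_article <= 0 or updated is None:
            return False
        now = now or datetime.datetime.utcnow()
        threshold = oldest_article if oldest_article > 365 else 86400 * oldest_article
        delta = now - updated
        return delta.days * 86400 + delta.seconds > threshold

    two_days_ago = datetime.datetime.utcnow() - datetime.timedelta(days=2)
    print(is_too_old(two_days_ago, 1))   # True: older than 1 day
    print(is_too_old(two_days_ago, 7))   # False: within 7 days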
Example No. 17
    def readability_by_soup(self, article, url, opts=None, user=None):
        """ 使用BeautifulSoup手动解析网页,提取正文内容
        因为图片文件占内存,为了节省内存,这个函数也做为生成器
        """
        content = self.preprocess(article)
        soup = BeautifulSoup(content, "lxml")

        try:
            title = soup.html.head.title.string
        except AttributeError:
            self.log.warn('object soup invalid!(%s)' % url)
            return
        if not title:
            self.log.warn('article has no title.[%s]' % url)
            return

        title = self.processtitle(title)
        soup.html.head.title.string = title

        if self.keep_only_tags:
            body = soup.new_tag('body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    keep_only_tags = [self.keep_only_tags]
                else:
                    keep_only_tags = self.keep_only_tags
                for spec in keep_only_tags:
                    for tag in soup.find('body').find_all(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replace_with(body)
            except AttributeError:  # soup has no body element
                pass

        for spec in self.remove_tags_after:
            tag = soup.find(**spec)
            remove_beyond(tag, 'next_sibling')

        for spec in self.remove_tags_before:
            tag = soup.find(**spec)
            remove_beyond(tag, 'previous_sibling')

        remove_tags = self.insta_remove_tags + self.remove_tags
        remove_ids = self.insta_remove_ids + self.remove_ids
        remove_classes = self.insta_remove_classes + self.remove_classes
        remove_attrs = self.insta_remove_attrs + self.remove_attrs

        for tag in soup.find_all(remove_tags):
            tag.decompose()
        for id in remove_ids:
            for tag in soup.find_all(attrs={"id": id}):
                tag.decompose()
        for cls in remove_classes:
            for tag in soup.find_all(attrs={"class": cls}):
                tag.decompose()
        for attr in remove_attrs:
            for tag in soup.find_all(attrs={attr: True}):
                del tag[attr]
        for cmt in soup.find_all(text=lambda text: isinstance(text, Comment)):
            cmt.extract()

        if self.extra_css:
            sty = soup.new_tag('style', type="text/css")
            sty.string = self.extra_css
            soup.html.head.append(sty)

        self.soupbeforeimage(soup)

        has_imgs = False
        thumbnail = None

        if self.keep_image:
            opener = URLOpener(self.host, timeout=self.timeout)
            for img in soup.find_all('img'):
                #More and more sites lazy-load their images, so handle that here.
                #Note: if attributes like data-src do not hold the real URL, there is nothing we can do.
                imgurl = img['src'] if 'src' in img.attrs else ''
                if not imgurl:
                    for attr in img.attrs:
                        if attr != 'src' and 'src' in attr:  #many sites use data-src
                            imgurl = img[attr]
                            break
                if not imgurl:
                    img.decompose()
                    continue
                if not imgurl.startswith('data:'):
                    if not imgurl.startswith('http'):
                        imgurl = self.urljoin(url, imgurl)
                    if self.fetch_img_via_ssl and url.startswith('https://'):
                        imgurl = imgurl.replace('http://', 'https://')
                    if self.isfiltered(imgurl):
                        self.log.warn('img filtered:%s' % imgurl)
                        img.decompose()
                        continue
                imgresult = opener.open(imgurl)
                imgcontent = self.process_image(
                    imgresult.content,
                    opts) if imgresult.status_code == 200 else None
                if imgcontent:
                    if len(imgcontent
                           ) < self.img_min_size:  #rexdf too small image
                        img.decompose()
                        continue

                    imgtype = imghdr.what(None, imgcontent)
                    if imgtype:
                        imgmime = r"image/" + imgtype
                        fnimg = "img%d.%s" % (self.imgindex, 'jpg' if imgtype
                                              == 'jpeg' else imgtype)
                        img['src'] = fnimg

                        #use the first image as the TOC thumbnail
                        if not has_imgs:
                            has_imgs = True
                            thumbnail = imgurl
                            yield (imgmime, imgurl, fnimg, imgcontent, None,
                                   True)
                        else:
                            yield (imgmime, imgurl, fnimg, imgcontent, None,
                                   None)
                    else:
                        img.decompose()
                else:
                    self.log.warn(
                        'fetch img failed(%s):%s' %
                        (URLOpener.CodeMap(imgresult.status_code), imgurl))
                    img.decompose()

            #strip links wrapping images so an accidental tap does not open the browser
            for img in soup.find_all('img'):
                if img.parent and img.parent.parent and \
                    img.parent.name == 'a':
                    img.parent.replace_with(img)
        else:
            for img in soup.find_all('img'):
                img.decompose()

        #add a content heading if there is none
        body = soup.html.body
        t = body.find(['h1', 'h2'])
        if not t:
            t = soup.new_tag('h2')
            t.string = title
            body.insert(0, t)
        else:
            totallen = 0
            for ps in t.previous_siblings:
                totallen += len(string_of_tag(ps))
                if totallen > 40:  #this H1/H2 appears mid-article, so it is not the article title
                    t = soup.new_tag('h2')
                    t.string = title
                    body.insert(0, t)
                    break

        #remove all attributes from body so InsertToc can match <body> with a regular expression
        bodyattrs = [attr for attr in body.attrs]
        for attr in bodyattrs:
            del body[attr]

        #convert HTML5 tags to div
        for x in soup.find_all([
                'article', 'aside', 'header', 'footer', 'nav', 'figcaption',
                'figure', 'section', 'time'
        ]):
            x.name = 'div'

        self.soupprocessex(soup)

        #insert share links
        if user:
            self.AppendShareLinksToArticle(soup, user, url)

        content = unicode(soup)

        #use the first part of the article content as the brief
        brief = u''
        if GENERATE_TOC_DESC:
            for h in body.find_all(['h1', 'h2']):  # drop h1/h2 to avoid duplicating the title
                h.decompose()
            for s in body.stripped_strings:
                brief += unicode(s) + u' '
                if len(brief) >= TOC_DESC_WORD_LIMIT:
                    brief = brief[:TOC_DESC_WORD_LIMIT]
                    break
        soup = None

        yield (title, None, None, content, brief, thumbnail)
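The image loop above first looks at src and then at any attribute containing "src" (data-src and friends) to cope with lazy-loaded images. That lookup in isolation, on an inline tag (the attribute name is just an example):

    from bs4 import BeautifulSoup

    img = BeautifulSoup('<img class="lazy" data-src="http://example.com/cover.jpg">',
                        'html.parser').img
    imgurl = img['src'] if 'src' in img.attrs else ''
    if not imgurl:
        for attr in img.attrs:
            if attr != 'src' and 'src' in attr:  # many sites use data-src
                imgurl = img[attr]
                break
    print(imgurl)  # http://example.com/cover.jpg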
Example No. 18
    def readability(self, article, url, opts=None, user=None):
        """ 使用readability-lxml处理全文信息
        因为图片文件占内存,为了节省内存,这个函数也做为生成器
        """
        content = self.preprocess(article)
        if not content:
            return

        # extract the main content
        try:
            doc = readability.Document(content,
                                       positive_keywords=self.positive_classes)
            summary = doc.summary(html_partial=False)
        except:
            # If extraction fails, the content may be a bare image (one image served as an article, without an HTML wrapper)
            imgtype = imghdr.what(None, content)
            if imgtype:  #if it is an image, wrap it in a simple HTML container
                imgmime = r"image/" + imgtype
                fnimg = "img%d.%s" % (self.imgindex,
                                      'jpg' if imgtype == 'jpeg' else imgtype)
                yield (imgmime, url, fnimg, content, None, None)
                tmphtml = '<html><head><title>Picture</title></head><body><img src="%s" /></body></html>' % fnimg
                yield ('Picture', None, None, tmphtml, '', None)
            else:
                self.log.warn('article is invalid.[%s]' % url)
            return

        title = doc.short_title()
        if not title:
            self.log.warn('article has no title.[%s]' % url)
            return

        title = self.processtitle(title)

        soup = BeautifulSoup(summary, "lxml")

        #if readability fails, fall back to the backup extractor (not as good, but copes with almost anything)
        body = soup.find('body')
        head = soup.find('head')
        if len(body.contents) == 0:
            from simpleextract import simple_extract
            summary = simple_extract(content)
            soup = BeautifulSoup(summary, "lxml")
            body = soup.find('body')
            if not body:
                self.log.warn('extract article content failed.[%s]' % url)
                return

            head = soup.find('head')
            #add a hint that the fallback extractor was used, as a small disclaimer about extraction quality :)
            info = soup.new_tag(
                'p', style='color:#555555;font-size:60%;text-align:right;')
            info.string = 'extracted by alternative algorithm.'
            body.append(info)

            self.log.info('use alternative algorithm to extract content.')

        if not head:
            head = soup.new_tag('head')
            soup.html.insert(0, head)

        if not head.find('title'):
            t = soup.new_tag('title')
            t.string = title
            head.append(t)

        #add a content heading if there is none
        t = body.find(['h1', 'h2'])
        if not t:
            t = soup.new_tag('h2')
            t.string = title
            body.insert(0, t)
        else:
            totallen = 0
            for ps in t.previous_siblings:
                totallen += len(string_of_tag(ps))
                if totallen > 40:  #this H1/H2 appears mid-article, so it is not the article title
                    t = soup.new_tag('h2')
                    t.string = title
                    body.insert(0, t)
                    break

        if self.remove_tags:
            for tag in soup.find_all(self.remove_tags):
                tag.decompose()
        for id in self.remove_ids:
            for tag in soup.find_all(attrs={"id": id}):
                tag.decompose()
        for cls in self.remove_classes:
            for tag in soup.find_all(attrs={"class": cls}):
                tag.decompose()
        for attr in self.remove_attrs:
            for tag in soup.find_all(attrs={attr: True}):
                del tag[attr]
        for cmt in soup.find_all(text=lambda text: isinstance(text, Comment)):
            cmt.extract()

        #remove all attributes from body so InsertToc can match <body> with a regular expression
        bodyattrs = [attr for attr in body.attrs]
        for attr in bodyattrs:
            del body[attr]

        if self.extra_css:
            sty = soup.new_tag('style', type="text/css")
            sty.string = self.extra_css
            soup.html.head.append(sty)

        self.soupbeforeimage(soup)

        has_imgs = False
        thumbnail = None

        if self.keep_image:
            opener = URLOpener(self.host, timeout=self.timeout)
            for img in soup.find_all('img'):
                #More and more sites lazy-load their images, so handle that here.
                #Note: if attributes like data-src do not hold the real URL, there is nothing we can do.
                imgurl = img['src'] if 'src' in img.attrs else ''
                if not imgurl:
                    for attr in img.attrs:
                        if attr != 'src' and 'src' in attr:  #many sites use data-src
                            imgurl = img[attr]
                            break
                if not imgurl:
                    img.decompose()
                    continue
                if not imgurl.startswith('data:'):
                    if not imgurl.startswith('http'):
                        imgurl = self.urljoin(url, imgurl)
                    if self.fetch_img_via_ssl and url.startswith('https://'):
                        imgurl = imgurl.replace('http://', 'https://')
                    if self.isfiltered(imgurl):
                        self.log.warn('img filtered : %s' % imgurl)
                        img.decompose()
                        continue
                imgresult = opener.open(imgurl)
                imgcontent = self.process_image(
                    imgresult.content,
                    opts) if imgresult.status_code == 200 else None
                if imgcontent:
                    if len(imgcontent
                           ) < self.img_min_size:  #rexdf too small image
                        img.decompose()
                        continue

                    imgtype = imghdr.what(None, imgcontent)
                    if imgtype:
                        imgmime = r"image/" + imgtype
                        fnimg = "img%d.%s" % (self.imgindex, 'jpg' if imgtype
                                              == 'jpeg' else imgtype)
                        img['src'] = fnimg

                        #use the first image as the TOC thumbnail
                        if not has_imgs:
                            has_imgs = True
                            thumbnail = imgurl
                            yield (imgmime, imgurl, fnimg, imgcontent, None,
                                   True)
                        else:
                            yield (imgmime, imgurl, fnimg, imgcontent, None,
                                   None)
                    else:
                        img.decompose()
                else:
                    self.log.warn(
                        'fetch img failed(%s):%s' %
                        (URLOpener.CodeMap(imgresult.status_code), imgurl))
                    img.decompose()

            #strip links wrapping images so an accidental tap does not open the browser
            for img in soup.find_all('img'):
                if img.parent and img.parent.parent and \
                    img.parent.name == 'a':
                    img.parent.replace_with(img)
        else:
            for img in soup.find_all('img'):
                img.decompose()

        #convert HTML5 tags to div
        for x in soup.find_all([
                'article', 'aside', 'header', 'footer', 'nav', 'figcaption',
                'figure', 'section', 'time'
        ]):
            x.name = 'div'

        self.soupprocessex(soup)

        #insert share links
        if user:
            self.AppendShareLinksToArticle(soup, user, url)

        content = unicode(soup)

        #use the first part of the article content as the brief
        brief = u''
        if GENERATE_TOC_DESC:
            for h in body.find_all(['h1', 'h2']):  # drop h1/h2 to avoid duplicating the title
                h.decompose()
            for s in body.stripped_strings:
                brief += unicode(s) + u' '
                if len(brief) >= TOC_DESC_WORD_LIMIT:
                    brief = brief[:TOC_DESC_WORD_LIMIT]
                    break
        soup = None

        yield (title, None, None, content, brief, thumbnail)
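Both readability variants detect the image type from raw bytes with imghdr and build the MIME type and filename from it. A small sketch of that mapping (image_mime is an illustrative helper; note that imghdr reports 'jpeg', which the examples rename to a .jpg file extension):

    import imghdr

    def image_mime(data):
        # Return a MIME type such as 'image/png', or None if the bytes are not a known image.
        kind = imghdr.what(None, data)
        return 'image/' + kind if kind else None

    png_header = b'\x89PNG\r\n\x1a\n' + b'\x00' * 16
    print(image_mime(png_header))       # image/png
    print(image_mime(b'not an image'))  # None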
Example No. 19
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        urls = []
        tnow = datetime.datetime.utcnow()
        urladded = set()

        for feed in self.feeds:
            section, url = feed[0], feed[1]
            isfulltext = feed[2] if len(feed) > 2 else False
            timeout = self.timeout + 10 if isfulltext else self.timeout
            opener = URLOpener(self.host, timeout=timeout)
            result = opener.open(url)
            if result.status_code == 200 and result.content:
                #debug_mail(result.content, 'feed.xml')

                if self.feed_encoding:
                    try:
                        content = result.content.decode(self.feed_encoding)
                    except UnicodeDecodeError:
                        content = AutoDecoder(True).decode(
                            result.content, opener.realurl, result.headers)
                else:
                    content = AutoDecoder(True).decode(result.content,
                                                       opener.realurl,
                                                       result.headers)
                feed = feedparser.parse(content)

                for e in feed['entries'][:self.max_articles_per_feed]:
                    updated = None
                    if hasattr(e, 'updated_parsed') and e.updated_parsed:
                        updated = e.updated_parsed
                    elif hasattr(e, 'published_parsed') and e.published_parsed:
                        updated = e.published_parsed
                    elif hasattr(e, 'created_parsed'):
                        updated = e.created_parsed

                    if self.oldest_article > 0 and updated:
                        updated = datetime.datetime(*(updated[0:6]))
                        delta = tnow - updated
                        if self.oldest_article > 365:
                            threshold = self.oldest_article  #value is in seconds
                        else:
                            threshold = 86400 * self.oldest_article  #value is in days

                        if delta.days * 86400 + delta.seconds > threshold:
                            self.log.info(
                                "Skip old article(%s): %s" %
                                (updated.strftime('%Y-%m-%d %H:%M:%S'),
                                 e.link))
                            continue

                    #support HTTPS
                    if hasattr(e, 'link'):
                        if url.startswith('https://'):
                            urlfeed = e.link.replace('http://', 'https://')
                        else:
                            urlfeed = e.link

                        if urlfeed in urladded:
                            continue
                    else:
                        urlfeed = ''

                    desc = None
                    if isfulltext:
                        summary = e.summary if hasattr(e, 'summary') else None
                        desc = e.content[0]['value'] if (hasattr(
                            e, 'content') and e.content[0]['value']) else None

                        #When both exist: some feeds put the full text in summary and some in content,
                        #so treat whichever is longer as the full text.
                        if summary and desc:
                            desc = summary if len(summary) > len(
                                desc) else desc
                        elif summary:
                            desc = summary

                        if not desc:
                            if not urlfeed:
                                continue
                            else:
                                self.log.warn(
                                    'fulltext feed item no has desc,link to webpage for article.(%s)'
                                    % e.title)
                    urls.append((section, e.title, urlfeed, desc))
                    urladded.add(urlfeed)
            else:
                self.log.warn('fetch rss failed(%s):%s' %
                              (URLOpener.CodeMap(result.status_code), url))

        return urls
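For full-text feeds, Example 19 takes the longer of the entry's summary and its first content value as the article body. A hedged stand-alone version of that choice (pick_fulltext is illustrative):

    def pick_fulltext(summary, content_value):
        # Prefer whichever of summary/content holds more text; either may be missing.
        candidates = [c for c in (summary, content_value) if c]
        return max(candidates, key=len) if candidates else None

    print(pick_fulltext('short teaser', '<p>the whole article body...</p>'))
    print(pick_fulltext(None, None))  # None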
Example No. 20
    def Items(self, opts=None, user=None):
        """
        生成器,返回一个元组
        对于HTML:section,url,title,content,brief,thumbnail
        对于图片,mime,url,filename,content,brief,thumbnail
        如果是图片,仅第一个图片的thumbnail返回True,其余为None
        """
        decoder = AutoDecoder(False)
        timeout = self.timeout
        for section, url in self.feeds:
            opener = URLOpener(self.host, timeout=timeout)
            result = opener.open(url)
            status_code, content = result.status_code, result.content
            if status_code != 200 or not content:
                self.log.warn('fetch article failed(%s):%s.' %
                              (URLOpener.CodeMap(status_code), url))
                continue

            if self.page_encoding:
                try:
                    content = content.decode(self.page_encoding)
                except UnicodeDecodeError:
                    content = decoder.decode(content, opener.realurl,
                                             result.headers)
            else:
                content = decoder.decode(content, opener.realurl,
                                         result.headers)

            content = self.preprocess(content)
            soup = BeautifulSoup(content, "lxml")

            head = soup.find('head')
            if not head:
                head = soup.new_tag('head')
                soup.html.insert(0, head)
            if not head.find('title'):
                t = soup.new_tag('title')
                t.string = section
                head.append(t)

            try:
                title = soup.html.head.title.string
            except AttributeError:
                title = section
                #self.log.warn('object soup invalid!(%s)'%url)
                #continue

            title = self.processtitle(title)

            if self.keep_only_tags:
                body = soup.new_tag('body')
                try:
                    if isinstance(self.keep_only_tags, dict):
                        keep_only_tags = [self.keep_only_tags]
                    else:
                        keep_only_tags = self.keep_only_tags
                    for spec in keep_only_tags:
                        for tag in soup.find('body').find_all(**spec):
                            body.insert(len(body.contents), tag)
                    soup.find('body').replace_with(body)
                except AttributeError:  # soup has no body element
                    pass

            for spec in self.remove_tags_after:
                tag = soup.find(**spec)
                remove_beyond(tag, 'next_sibling')

            for spec in self.remove_tags_before:
                tag = soup.find(**spec)
                remove_beyond(tag, 'previous_sibling')

            remove_tags = self.insta_remove_tags + self.remove_tags
            remove_ids = self.insta_remove_ids + self.remove_ids
            remove_classes = self.insta_remove_classes + self.remove_classes
            remove_attrs = self.insta_remove_attrs + self.remove_attrs
            for tag in soup.find_all(remove_tags):
                tag.decompose()
            for id in remove_ids:
                for tag in soup.find_all(attrs={"id": id}):
                    tag.decompose()
            for cls in remove_classes:
                for tag in soup.find_all(attrs={"class": cls}):
                    tag.decompose()
            for attr in remove_attrs:
                for tag in soup.find_all(attrs={attr: True}):
                    del tag[attr]
            for cmt in soup.find_all(
                    text=lambda text: isinstance(text, Comment)):
                cmt.extract()

            #remove all attributes from body so InsertToc can match <body> with a regular expression
            body = soup.html.body
            bodyattrs = [attr for attr in body.attrs]
            for attr in bodyattrs:
                del body[attr]

            if self.extra_css:
                sty = soup.new_tag('style', type="text/css")
                sty.string = self.extra_css
                soup.html.head.append(sty)

            has_imgs = False
            thumbnail = None
            if self.keep_image:
                self.soupbeforeimage(soup)
                for img in soup.find_all('img'):
                    #More and more sites lazy-load their images, so handle that here.
                    #Note: if attributes like data-src do not hold the real URL, there is nothing we can do.
                    imgurl = img['src'] if 'src' in img.attrs else ''
                    if not imgurl:
                        for attr in img.attrs:
                            if attr != 'src' and 'src' in attr:  #many sites use data-src
                                imgurl = img[attr]
                                break
                    if not imgurl:
                        img.decompose()
                        continue
                    if not imgurl.startswith('data:'):
                        if not imgurl.startswith('http'):
                            imgurl = self.urljoin(url, imgurl)
                        if self.fetch_img_via_ssl and url.startswith(
                                'https://'):
                            imgurl = imgurl.replace('http://', 'https://')
                        if self.isfiltered(imgurl):
                            self.log.warn('img filtered:%s' % imgurl)
                            img.decompose()
                            continue
                    imgresult = opener.open(imgurl)
                    imgcontent = self.process_image(
                        imgresult.content,
                        opts) if imgresult.status_code == 200 else None
                    if imgcontent:
                        if len(imgcontent
                               ) < self.img_min_size:  #rexdf too small image
                            img.decompose()
                            continue

                        imgtype = imghdr.what(None, imgcontent)
                        if imgtype:
                            imgmime = r"image/" + imgtype
                            fnimg = "img%d.%s" % (self.imgindex,
                                                  'jpg' if imgtype == 'jpeg'
                                                  else imgtype)
                            img['src'] = fnimg

                            #use the first image as the TOC thumbnail
                            if not has_imgs:
                                has_imgs = True
                                thumbnail = imgurl
                                yield (imgmime, imgurl, fnimg, imgcontent,
                                       None, True)
                            else:
                                yield (imgmime, imgurl, fnimg, imgcontent,
                                       None, None)
                        else:
                            img.decompose()
                    else:
                        self.log.warn(
                            'fetch img failed(%s):%s' %
                            (URLOpener.CodeMap(imgresult.status_code), imgurl))
                        img.decompose()

                #strip the links wrapping images
                for img in soup.find_all('img'):
                    if img.parent and img.parent.parent and \
                        img.parent.name == 'a':
                        img.parent.replace_with(img)

            else:
                for img in soup.find_all('img'):
                    img.decompose()

            self.soupprocessex(soup)
            content = unicode(soup)

            #use the first part of the article content as the brief
            brief = u''
            if GENERATE_TOC_DESC:
                for h in body.find_all(['h1', 'h2']):  # drop h1/h2 to avoid duplicating the title
                    h.decompose()
                for s in body.stripped_strings:
                    brief += unicode(s) + u' '
                    if len(brief) >= TOC_DESC_WORD_LIMIT:
                        brief = brief[:TOC_DESC_WORD_LIMIT]
                        break
            soup = None

            content = self.postprocess(content)
            yield (section, url, title, content, brief, thumbnail)
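All three generators build the TOC description by concatenating body.stripped_strings, after dropping any h1/h2 headings, up to TOC_DESC_WORD_LIMIT characters. A stand-alone sketch of that step (make_brief and its limit argument are illustrative stand-ins for the real constants):

    from bs4 import BeautifulSoup

    def make_brief(body_html, limit=60):
        # Join visible text fragments until the length limit is reached.
        brief = u''
        for s in BeautifulSoup(body_html, 'html.parser').stripped_strings:
            brief += s + u' '
            if len(brief) >= limit:
                return brief[:limit]
        return brief

    print(make_brief(u'<div><p>First paragraph.</p><p>Second paragraph.</p></div>'))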