Example #1: a parse_news callback that fills the article item with the page URL, the body text (paragraphs under the TRS_Editor div), the source agency, RAKE keywords and tagged text, and a category derived from substrings of the URL.
    def parse_news(self, response):

        # populate the rest of the article
        article = response.meta['article']
        article['url'] = response.url

        # news contents
        contents = ' '.join(
            response.xpath(
                '//div[@class="TRS_Editor"]//p/text()').extract()).strip()

        # news agency: strip whitespace, then drop the first three characters
        # (assumed to be a source label such as "来源：")
        agency_ = response.xpath(
            '//div[@class="laiyuan"]//span[@id="articleSource"]/text()'
        ).extract()[0].replace(' ', '').replace('\n', '')
        agency = agency_[3:]

        #news category
        #category = response.xpath('//channel').extract()
        #Get keywords and tagged_text
        rake = ChRake()
        keywords_list = rake.run(contents)
        keywords = '\n'.join(keywords_list)
        tagged_text = rake.get_tagged_text()

        # populate agency, contents, keywords and tagged_text
        article['agency'] = agency
        article['contents'] = contents
        article['keywords'] = keywords
        article['tagged_text'] = tagged_text

        if 'cysc' in response.url:
            article['category'] = '产业市场'
        elif 'sjjj' in response.url:
            article['category'] = '国际经济'
        elif 'district' in response.url:
            article['category'] = '地方经济'
        elif 'gnsz' in response.url:
            article['category'] = '国内时政'
        elif 'shgj' in response.url:
            article['category'] = '社会'
        elif 'qqss' in response.url:
            article['category'] = '全球时事'
        elif 'finance' in response.url:
            article['category'] = '经济'
        elif 'specials' in response.url:
            article['category'] = '独家专稿'
        else:
            article['category'] = '其他'

        yield article
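
A note on the category mapping in Example #1 (and the longer chain in a later example): the same URL-substring lookup can be written as a data-driven table. A minimal sketch, not part of the original spider; the names CATEGORY_MAP and categorize are illustrative, and the pairs are copied from the chain above.

# Sketch of a table-driven alternative to the if/elif chain above; not part of
# the original spider. Pairs are checked in order, so more specific substrings
# can be listed before more general ones.
CATEGORY_MAP = [
    ('cysc', u'产业市场'),
    ('sjjj', u'国际经济'),
    ('district', u'地方经济'),
    ('gnsz', u'国内时政'),
    ('shgj', u'社会'),
    ('qqss', u'全球时事'),
    ('finance', u'经济'),
    ('specials', u'独家专稿'),
]


def categorize(url, default=u'其他'):
    for key, label in CATEGORY_MAP:
        if key in url:
            return label
    return default

# inside parse_news:
#     article['category'] = categorize(response.url)
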
Example #2: a parse_news callback that normalizes the article date, extracts the body text with a fallback XPath, extracts the source, runs RAKE for keywords and tagged text, yields the article, and then requests a JSONP comment endpoint for the same news id.
    def parse_news(self, response):

        article = response.meta['article']

        news_date = time.strftime('%Y-%m-%d %H:%M:%S',
                                  self.parse_date(article['date']))

        news_content = response.xpath(
            './/div[@id="center"]/div[@id="article"]/div[@class="article"]//text()'
        ).extract()
        news_content = ' '.join(news_content).strip()
        if len(news_content) < 5:
            news_content = response.xpath(
                './/div[@class="main pagewidth"]/div[@id="content"]/p/text()'
            ).extract()
            news_content = ' '.join(news_content).strip()

        news_author = response.xpath(
            './/div[@id="center"]/div[@id="article"]//em[@id="source"]//text()'
        ).extract()
        if len(news_author) > 0:
            news_author = news_author[0]
        else:
            news_author = response.xpath(
                './/div[@class="main_tit"]/div[@class="info"]/span[@id="source"]//text()'
            ).extract()[0]

        #Get keywords and tagged_text
        rake = ChRake()
        keywords_list = rake.run(news_content)
        keywords = '\n'.join(keywords_list)
        tagged_text = rake.get_tagged_text()

        #Populate
        article['contents'] = news_content
        article['date'] = news_date
        article['agency'] = news_author
        article['keywords'] = keywords
        article['tagged_text'] = tagged_text

        yield article
        #for comment page
        #comment_url = response.xpath('.//div[@id="da-comment"]//a[@target="_blank"]/@href').extract()

        comment_url = 'http://comment.home.news.cn/a/newsComm.do?_ksTS=1444922731622_49&callback=jsonp50&newsId=' + article[
            'aid']

        print(comment_url)
        #if len(comment_url) >0:
        req = scrapy.Request(comment_url,
                             callback=self.parse_comment,
                             dont_filter=True)
        req.meta['newsId'] = article['aid']
        yield req
        # comment_check_url = 'http://m.news.naver.com/api/comment/count.json'
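
parse_comment itself is not shown in these snippets. A minimal sketch of what it might look like, assuming json and re are imported at module level and that the endpoint returns a jsonp50(...)-wrapped JSON payload; the field names used below (comments, content) are hypothetical, not confirmed by the source.

    def parse_comment(self, response):
        # the response body is expected to look like: jsonp50({...});
        # strip the JSONP wrapper before parsing
        body = response.text  # body_as_unicode() on very old Scrapy versions
        match = re.search(r'\w+\((.*)\)\s*;?\s*$', body, re.S)
        if not match:
            return
        data = json.loads(match.group(1))
        # 'comments' and 'content' are assumed field names
        for comment in data.get('comments', []):
            yield {
                'newsId': response.meta['newsId'],
                'content': comment.get('content'),
            }
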
Example #3: a parse_next_page callback that appends this page's paragraphs to the text accumulated so far, re-runs RAKE, and either yields the finished article or follows the pager link with the accumulated state carried in request.meta.
    def parse_next_page(self, response):
        try:
            article = response.meta['article']
            content = response.meta['contents']

            content_1 = response.xpath('//*[@id="text"]/p/text()').extract()
            content_1_1 = ''.join(content_1)

            # merge this page's content with the content from previous pages
            content_2 = content + content_1_1
            #Get keywords and tagged_text
            rake = ChRake()
            keywords_list = rake.run(content_2)
            keywords = '\n'.join(keywords_list)
            tagged_text = rake.get_tagged_text()

            #Populate
            article['contents'] = content_2
            article['keywords'] = keywords
            article['tagged_text'] = tagged_text


            # the current page number is assumed to be the single character
            # just before the ".html" suffix of the URL
            this_page = response.url
            count = this_page[-6:-5]

            count_1 = int(count)+1

            str_1 = '//*[@id="pages"]/a['+str(count_1)+']/text()'
            str_2 = '//*[@id="pages"]/a['+str(count_1)+']/@href'
            count_2 = response.xpath(str_1).extract()

            # determine whether there is a next page
            if u'\u4e0b\u4e00\u9875' in count_2:
                yield article

            else:
                next_url = response.xpath(str_2).extract()
                next_url_1 = str(next_url[0])
                req = scrapy.Request(next_url_1, callback = self.parse_next_page)
                req.meta['article'] = article
                req.meta['contents'] = content_2
                req.meta['keywords'] = keywords
                req.meta['tagged_text'] = tagged_text
                yield req

        except Exception:
            print('Parse_next_page ERROR!!!!!!!!!!!!!  :' + response.url)
            traceback.print_exc(file=sys.stdout)
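
The pager handling above depends on the page number sitting at a fixed offset in the URL. A minimal sketch of a less position-dependent variant that selects the 下一页 ("next page") anchor by its text; note that it follows that link whenever one exists, which is not exactly the same condition as the index-based check above.

            # sketch: find the "下一页" (next page) anchor directly by its text
            next_href = response.xpath(
                u'//*[@id="pages"]/a[text()="\u4e0b\u4e00\u9875"]/@href').extract()
            if next_href:
                # response.urljoin (Scrapy >= 1.0) resolves relative hrefs
                req = scrapy.Request(response.urljoin(next_href[0]),
                                     callback=self.parse_next_page)
                req.meta['article'] = article
                req.meta['contents'] = content_2
                yield req
            else:
                yield article
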
Example #4: a parse_news callback that extracts the source agency and body paragraphs, maps URL substrings to one of many categories, follows a next-page link into parse_next_page when a pager is present, and finally requests a comment JSON endpoint keyed by the article id.
    def parse_news(self, response):
        try:
            # get the rest of the article
            article = response.meta['article']

            # extract() returns a list of text nodes; join it so the agency is
            # stored as a single string, as in the other examples
            agency = ''.join(response.xpath(
                '//div[@class="clearfix w1000_320 text_title"]'
                '//div[@class="box01"]//div[@class="fl"]//a/text()'
            ).extract()).strip()
            content_1 = response.xpath('//*[@id="rwb_zw"]/p/text()').extract()
            article['agency'] = agency

            # get the category of the news from the URL
            category_url = response.url
            if 'world' in category_url:
                article['category'] = '国际'
            elif 'politics' in category_url:
                article['category'] = '时政'
            elif 'finance' in category_url:
                article['category'] = '财经'
            elif 'money' in category_url:
                article['category'] = '金融'
            elif 'energy' in category_url:
                article['category'] = '能源'
            elif 'legal' in category_url:
                article['category'] = '法治'
            elif 'society' in category_url:
                article['category'] = '社会'
            elif 'hm' in category_url:
                article['category'] = '港澳'
            elif 'pic' in category_url:
                article['category'] = '图片'
            elif 'tw' in category_url:
                article['category'] = '台湾'
            elif 'sports' in category_url:
                article['category'] = '体育'
            elif 'military' in category_url:
                article['category'] = '军事'
            elif 'health' in category_url:
                article['category'] = '健康'
            elif 'theory' in category_url:
                article['category'] = '理论'
            elif 'opinion' in category_url:
                article['category'] = '观点'
            elif 'media' in category_url:
                article['category'] = '传媒'
            elif 'ent' in category_url:
                article['category'] = '娱乐'
            elif 'it.people' in category_url:
                article['category'] = 'IT'
            elif 'env' in category_url:
                article['category'] = '环保'
            elif 'tc' in category_url:
                article['category'] = '通信'
            elif 'homea' in category_url:
                article['category'] = '家电'
            elif 'house' in category_url:
                article['category'] = '房产'
            elif 'ccnews' in category_url:
                article['category'] = '央企'
            elif 'scitech' in category_url:
                article['category'] = '科技'
            elif 'culture' in category_url:
                article['category'] = '文化'
            elif 'yuqing' in category_url:
                article['category'] = '舆情'
            elif 'lady' in category_url:
                article['category'] = '时尚'
            elif 'game' in category_url:
                article['category'] = '游戏'
            elif 'comic' in category_url:
                article['category'] = '动漫'
            elif 'npc.people' in category_url:
                article['category'] = '人大新闻'
            elif 'usa.people' in category_url:
                article['category'] = '美国'
            elif 'shipin' in category_url:
                article['category'] = '食品'
            elif 'edu.people' in category_url:
                article['category'] = '教育'
            elif 'gongyi' in category_url:
                article['category'] = '公益'
            elif 'jiaju' in category_url:
                article['category'] = '家居'
            elif 'qipai' in category_url:
                article['category'] = '棋牌'
            elif 'www.people' in category_url:
                article['category'] = '人民微博'
            else:
                article['category'] = '其他'

            # get keywords and tagged_text
            content = ''.join(content_1).replace(' ', '')
            rake = ChRake()
            keywords_list = rake.run(content)
            keywords = '\n'.join(keywords_list)
            tagged_text = rake.get_tagged_text()

            article['contents'] = content
            article['keywords'] = keywords
            article['tagged_text'] = tagged_text

            # if a pager is present, follow the "next page" link and carry the
            # article and the text collected so far in the request meta
            if response.xpath('//*[@id="rwb_zw"]/center/table/tbody/tr/td/a/text()'):
                next_url_0 = response.xpath('//*[@id="rwb_zw"]/div[2]/a[2]/@href').extract()
                # build an absolute URL from the part of the current URL
                # before "/n/" plus the relative href
                pos = category_url.find("/n/")
                next_url = category_url[:pos] + str(next_url_0[0])
                req = scrapy.Request(next_url, callback=self.parse_next_page, dont_filter=self.dont_filter)
                req.meta['article'] = article
                req.meta['contents'] = ''.join(content_1).strip()
                yield req
            else:
                yield response.meta['article']

            # request the comment JSON endpoint for this article
            comment_json_url = 'http://bbs1.people.com.cn/api/news.do?action=lastNewsComments&newsId=' + article['aid']
            req = scrapy.Request(comment_json_url, callback=self.parse_comment, dont_filter=self.dont_filter)
            yield req


        except Exception:
            print('Parse_news ERROR!!!!!!!!!!!!!  URL :' + article['url'])
            traceback.print_exc(file=sys.stdout)
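
Several of these callbacks construct the Request first and then assign into req.meta. scrapy.Request also accepts a meta dict directly, which reads a bit more compactly; a minimal sketch using the names from this example:

            req = scrapy.Request(
                next_url,
                callback=self.parse_next_page,
                dont_filter=self.dont_filter,
                meta={
                    'article': article,
                    'contents': ''.join(content_1).strip(),
                })
            yield req
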
Example #5: a parse_news callback that sleeps briefly, derives the article id from the URL, extracts the title, source and body, runs RAKE, yields the article, and then requests the comment.news.163.com comment API for that id.
    def parse_news(self, response):
        time.sleep(1)
        try:
            # populate the rest of the article
            article = response.meta['article']
            # article id: the last URL path segment with the (assumed) trailing ".html" removed
            aid = str(article['url'])[article['url'].rfind('/') + 1:-5]
            title = response.xpath(
                '//div[@class="post_content_main"]//h1/text()').extract()
            #print title

            agency = response.xpath(
                '//div[@class="post_time_source"]//a[@id="ne_article_source"]/text()'
            ).extract()
            #print agency

            content = response.xpath(
                '//div[@class="post_text"]//p/text()').extract()
            contents = ''.join(content)

            #Get keywords and tagged_text
            rake = ChRake()
            keywords_list = rake.run(contents)
            keywords = '\n'.join(keywords_list)
            tagged_text = rake.get_tagged_text()

            #Populate
            article['title'] = title[0]
            article['agency'] = agency[0]
            article['aid'] = aid
            article['contents'] = contents
            article['keywords'] = keywords
            article['tagged_text'] = tagged_text

            yield response.meta['article']

            # start comment parsing
            category = article['category']
            news_url = article['url']
            print('=====================' + news_url)
            comment_url_base = 'http://comment.news.163.com/cache/newlist/'
            '''
            #get page source
            pageSource = urllib2.urlopen(news_url).read().decode("gbk").encode("utf-8")
            #get boardId from page source
            c = re.search(r"(?<=boardId = ).+?(?=$)",pageSource,re.M)
            boardID = self.GetMiddleStr(c.group(),'"','",')
            if category == 0:
                boardID = 'news_guonei8_bbs'
            elif category == 1:
                boardID = 'news3_bbs'
            elif category == 2:
                boardID = 'news_shehui7_bbs'
            elif category == 4:
                boardID = 'news3_bbs'
            elif category == 5:
                boardID = 'news_junshi_bbs'
            comment_url = comment_url_base + boardID + '/' +  aid + '_1.html'
            print '=============' + comment_url
            '''
            #http://comment.news.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/BQLQTMO800014SEH/
            #comments/newList?offset=0&limit=30&showLevelThreshold=72&headLimit=1&tailLimit=2&callback=getData&ibc=newspc
            comment_url = 'http://comment.news.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/' \
            + aid  + '/comments/newList?offset=0&limit=30&showLevelThreshold=72&headLimit=1&tailLimit=2&callback=getData&ibc=newspc'

            req = scrapy.Request(comment_url,
                                 callback=self.parse_comment,
                                 dont_filter=self.dont_filter)
            req.meta['aid'] = aid
            yield req
        except Exception:
            print('Parse_news ERROR!!!!!!!!!!!!!  URL :' + article['url'])
            traceback.print_exc(file=sys.stdout)
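
These snippets index extract()[0] directly (title, agency, source), which raises IndexError when an XPath matches nothing. A minimal sketch of a more defensive variant using extract_first(), which Scrapy selector lists provide with a default argument (get() in newer releases), shown with the XPaths from this example:

            # sketch: extract_first() returns the default instead of raising
            # IndexError when nothing matches
            title = response.xpath(
                '//div[@class="post_content_main"]//h1/text()'
            ).extract_first(default='')
            agency = response.xpath(
                '//div[@class="post_time_source"]//a[@id="ne_article_source"]/text()'
            ).extract_first(default='')
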