Example #1
    def content_parse(self, response):
        # Publication date is forwarded from the listing page via request meta.
        date = response.meta.get('date')
        if not date:
            return
        try:
            # Skip articles older than the spider's time limit.
            if helper.compare_time(date, self.limittime) < 0:
                return
        except Exception:
            return

        pipleitem = CctvOpinionmonitor4Item()

        pipleitem['date'] = date
        pipleitem['id'] = response.meta.get('id')
        pipleitem['url'] = response.url
        pipleitem['title'] = response.css('.title::text').extract_first()
        pipleitem['source'] = response.xpath('//div[@name="source"]/@content').extract_first()
        pipleitem['content'] = helper.list2str(response.css('.story-body__inner p').xpath('string(.)').extract())
        pipleitem['editor'] = response.css('.author a::text').extract_first()
        pipleitem['views'] = None
        pipleitem['image_urls'] = helper.list2str(response.css('img::attr(src)').extract())
        pipleitem['video_urls'] = helper.list2str(response.css('video::attr(src)').extract())
        pipleitem['share'] =  None
        pipleitem['like'] =  None
        pipleitem['dislike'] = None
        pipleitem['comment'] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
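
All of the examples on this page lean on a project-level helper module whose code is not shown. Below is a minimal sketch of what those utilities are assumed to do, inferred only from how they are called here; the names come from the examples, but the bodies and exact semantics are guesses.

import time
from datetime import datetime

# Hypothetical stand-ins for the project's helper module, inferred from usage.
def compare_time(date_str, limit_str, fmt='%Y-%m-%d %H:%M:%S'):
    # Assumed: negative when date_str is earlier than limit_str.
    delta = datetime.strptime(date_str, fmt) - datetime.strptime(limit_str, fmt)
    return int(delta.total_seconds())

def list2str(values):
    # Assumed: join scraped fragments into a single string.
    return ''.join(str(v).strip() for v in (values or []) if v)

def get_makedtime(fmt, value):
    # Assumed: normalize an upstream timestamp (string or epoch) to fmt.
    if isinstance(value, (int, float)):
        return time.strftime(fmt, time.localtime(value))
    return value

def get_localtimestamp():
    # Assumed: the crawl time as a unix timestamp.
    return int(time.time())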
Example #2
    def content_parse(self, response):
        jsonbd = json.loads(response.text)
        if not jsonbd:
            return
        date = jsonbd.get('pub_time')
        if date is None:
            return
        try:
            if helper.compare_time(date, self.limittime) < 0:
                return
        except Exception:
            return

        pipleitem = CctvOpinionmonitor4Item()

        pipleitem['date'] = date
        pipleitem['id'] = jsonbd.get('id')
        pipleitem['url'] = response.url
        pipleitem['title'] = jsonbd.get('title')
        card = jsonbd.get('card') or {}
        pipleitem['source'] = card.get('chlname')
        content = jsonbd.get('content')
        pipleitem['content'] = helper.list2str(
            # Keep only the text between HTML tags.
            re.findall('>(.*?)<', content['text'])) if content else ''
        pipleitem['editor'] = None
        count_info = jsonbd.get('count_info') or {}
        pipleitem['views'] = count_info.get('playcount')
        videoList = []
        imageslist = []
        attribute = jsonbd.get('attribute')
        # Collect media URLs from attribute entries whose key mentions VIDEO or IMG.
        for key in attribute.keys() if isinstance(attribute, dict) else []:
            entry = attribute[key]
            if 'VIDEO' in key and 'playurl' in entry:
                videoList.append(entry['playurl'])
            if 'IMG' in key and 'url' in entry:
                imageslist.append(entry['url'])

        if 'imgurl' in jsonbd:
            imageslist.append(jsonbd['imgurl'])
        pipleitem['image_urls'] = helper.list2str(imageslist)
        pipleitem['video_urls'] = helper.list2str(videoList)
        pipleitem['share'] = count_info.get('share_count')
        pipleitem['like'] = count_info.get('like_info')
        pipleitem['dislike'] = None
        pipleitem['comment'] = count_info.get('comments')
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
Example #3
    def parse(self, response):
        jsbd = json.loads(response.text)
        # Map each article's context_id to its comment count.
        commentlist = {}

        for item in jsbd['comment_infos']['result']:
            commentlist[item['context_id']] = item['count']

        for item in jsbd['items']['result']:
            date = item.get('published_at')
            if date is None or len(str(date)) == 0:
                continue
            try:
                # Once an item falls outside the time window, stop the whole
                # listing parse (not just this item).
                if helper.compare_time(
                        helper.get_makedtime('%Y-%m-%d %H:%M:%S', date),
                        self.limittime) < 0:
                    return
            except Exception:
                return

            pipleitem = CctvOpinionmonitor4Item()

            pipleitem['date'] = helper.get_makedtime('%Y-%m-%d %H:%M:%S', date)
            pipleitem['id'] = item.get('context_id')
            pipleitem['url'] = item.get('link')
            pipleitem['title'] = item.get('title')
            pipleitem['source'] = item.get('publisher')
            content_html = item.get('content') or ''
            pipleitem['content'] = helper.list2str(
                # Text between HTML tags.
                re.findall('>(.*?)<', content_html)) if content_html else ''
            pipleitem['editor'] = item.get('author_name')
            pipleitem['views'] = None
            pipleitem['image_urls'] = helper.list2str(
                re.findall('<img.*?src="(.*?)"',
                           content_html)) if content_html else ''
            pipleitem['video_urls'] = helper.list2str(
                re.findall('<video.*?src="(.*?)"',
                           content_html)) if content_html else ''
            pipleitem['share'] = None
            pipleitem['like'] = None
            pipleitem['dislike'] = None
            pipleitem['comment'] = commentlist.get(item.get('context_id'))
            pipleitem['crawl_time'] = helper.get_localtimestamp()

            yield pipleitem
Example #4
    def content_parse(self, response):
        jsonbd = json.loads(response.text)
        if not jsonbd:
            return
        date = jsonbd.get('publish_time')
        if date is None:
            return
        try:
            if helper.compare_time(
                    helper.get_makedtime('%Y-%m-%d %H:%M:%S', date),
                    self.limittime) < 0:
                return
        except Exception:
            return

        pipleitem = CctvOpinionmonitor4Item()

        pipleitem['date'] = helper.get_makedtime('%Y-%m-%d %H:%M:%S', date)
        pipleitem['id'] = jsonbd.get('article_id')
        pipleitem['url'] = response.url
        pipleitem['title'] = jsonbd.get('title')
        pipleitem['source'] = response.meta.get('mediaName')
        pipleitem['content'] = helper.list2str(
            re.findall('>(.*?)<',
                       jsonbd['content'])) if jsonbd.get('content') else ''
        media = jsonbd.get('media') or {}
        pipleitem['editor'] = helper.list2str(
            media['author']) if 'author' in media else None
        pipleitem['views'] = None
        imageList = []
        for i in jsonbd.get('images') or []:
            imageList.append(i['url'])
        pipleitem['image_urls'] = helper.list2str(imageList)
        # Fall back to None (not []) so the field type matches the other items.
        pipleitem['video_urls'] = helper.list2str(
            jsonbd['videos']) if 'videos' in jsonbd else None
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['comment'] = response.meta.get('comment')
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
Example #5
    def content_parse(self, response):
        date = response.xpath(
            '//div[@class="fl times"]/text()').extract_first()
        if not date:
            return
        try:
            if helper.compare_time(date, self.limittime) < 0:
                return
        except Exception:
            return

        pipleitem = CctvOpinionmonitor4Item()

        pipleitem['date'] = date
        pipleitem['id'] = response.meta.get('id')
        pipleitem['url'] = response.url
        pipleitem['title'] = response.css('title::text').extract_first()
        source = response.xpath(
            '//div[@class="fl origin"]/text()').extract_first()
        # Strip the "来源:" ("source:") label.
        pipleitem['source'] = re.sub('来源:', '', source) if source else None
        pipleitem['content'] = helper.list2str(
            response.css('.news-detail-cont').xpath('string(.)').extract())
        pipleitem['editor'] = None
        pipleitem['views'] = None
        pipleitem['image_urls'] = helper.list2str(
            response.css('.news-detail-cont img::attr(src)').extract())
        pipleitem['video_urls'] = None
        pipleitem['share'] = None
        pipleitem['like'] = response.css(
            '#support .num-total::text').extract_first()
        pipleitem['dislike'] = response.css(
            '#against .num-total::text').extract_first()
        # Fetch the comment count with a separate, blocking POST to the
        # comment API (see the non-blocking sketch after this example).
        self.commentheaders['Referer'] = response.url
        self.commentpar['articleId'] = response.meta['id']
        html = requests.post(
            url='https://comment.yorkbbs.ca/api/comment/getComment',
            data=self.commentpar,
            headers=self.commentheaders)
        pipleitem['comment'] = json.loads(html.text)['totalCount']
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
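
The example above blocks Scrapy's event loop while requests.post fetches the comment count. Below is a sketch of a non-blocking alternative that chains the lookup as a second Scrapy request and carries the half-filled item along in request.meta; the class name and method split are illustrative, not part of the original spider, and the commentpar / commentheaders attributes are assumed to exist as in the example above.

import json
import scrapy

class CommentChainSketch:
    # Illustrative only; assumes the same commentpar / commentheaders
    # attributes and CctvOpinionmonitor4Item fields as the example above.

    def request_comment_count(self, pipleitem, article_id, referer):
        self.commentpar['articleId'] = article_id
        return scrapy.FormRequest(
            url='https://comment.yorkbbs.ca/api/comment/getComment',
            formdata=self.commentpar,
            headers=dict(self.commentheaders, Referer=referer),
            callback=self.comment_parse,
            # The partially filled item travels with the request.
            meta={'item': pipleitem})

    def comment_parse(self, response):
        item = response.meta['item']
        item['comment'] = json.loads(response.text)['totalCount']
        yield item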
Example #6
    def content_parse(self, response):
        jsonbd = json.loads(response.text)
        if not jsonbd or 'data' not in jsonbd:
            return
        jsonbd = jsonbd['data']
        date = jsonbd.get('datetime')
        if date is None:
            return
        try:
            if helper.compare_time(
                    helper.get_makedtime('%Y-%m-%d %H:%M:%S', date),
                    self.limittime) < 0:
                return
        except Exception:
            return

        pipleitem = CctvOpinionmonitor4Item()

        pipleitem['date'] = helper.get_makedtime('%Y-%m-%d %H:%M:%S', date)
        pipleitem['id'] = jsonbd.get('ikey')
        pipleitem['url'] = response.url
        pipleitem['title'] = jsonbd.get('title')
        pipleitem['source'] = '多维新闻'
        pipleitem['content'] = helper.list2str(
            re.findall('>(.*?)<',
                       jsonbd['content'])) if jsonbd.get('content') else ''
        pipleitem['editor'] = response.meta.get('author')
        pipleitem['views'] = None
        # Fall back to None (not []) so the field type matches the other items.
        pipleitem['image_urls'] = helper.list2str(
            jsonbd['picurl']) if 'picurl' in jsonbd else None
        pipleitem['video_urls'] = None
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['comment'] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
Example #7
    def content_parse(self, response):
        date = response.meta.get('date')
        if not date:
            return
        try:
            if helper.compare_time(date, self.limittime) < 0:
                return
        except Exception:
            return

        pipleitem = CctvOpinionmonitor4Item()

        pipleitem['date'] = date
        pipleitem['id'] = response.meta.get('id')
        pipleitem['url'] = response.url
        pipleitem['title'] = response.css(
            '.container h3::text').extract_first()
        pipleitem['source'] = response.xpath(
            '//div[@class="source imedia"]/text()').extract_first()
        pipleitem['content'] = helper.list2str(
            response.css('#js-article p').xpath('string(.)').extract())
        editor = response.css('#yidian_editor::text').extract_first()
        # Strip the "责任编辑:" ("responsible editor:") label.
        pipleitem['editor'] = re.sub('责任编辑:', '', editor) if editor else ''
        pipleitem['views'] = None
        pipleitem['image_urls'] = helper.list2str(
            response.css('#yidian_editor img::attr(src)').extract())
        pipleitem['video_urls'] = helper.list2str(
            response.css('#yidian_editor video::attr(src)').extract())
        pipleitem['share'] = None
        pipleitem['like'] = response.meta.get('like', 0)
        pipleitem['dislike'] = None
        pipleitem['comment'] = response.meta.get('comment_count', 0)
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
Example #8
    def content_parse(self, response):
        date = response.meta.get('post_date')
        if not date:
            return
        try:
            if helper.compare_time(date, self.limittime) < 0:
                return
        except Exception:
            return

        pipleitem = CctvOpinionmonitor4Item()

        pipleitem['date'] = helper.formatTime(date)
        pipleitem['id'] = response.meta.get('id')
        pipleitem['url'] = response.url
        pipleitem['title'] = response.css(
            '#arcmaintitle::text').extract_first()
        pipleitem['source'] = response.meta.get('source')
        pipleitem['content'] = helper.list2str(
            response.css('#arcbody').xpath('string(.)').extract())
        pipleitem['editor'] = response.meta.get('author')
        pipleitem['views'] = None
        imageList = []
        for i in response.css('#arcbody img::attr(src)').extract():
            # Image paths are site-relative, so prepend the host.
            imageList.append('https://info.51.ca{}'.format(i))
        pipleitem['image_urls'] = helper.list2str(imageList)
        pipleitem['video_urls'] = None
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['comment'] = response.meta.get('comments_num')
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
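
Several of the content_parse callbacks on this page, including the one above, expect fields such as post_date, id, source, or author to arrive in response.meta. Below is a hypothetical sketch of the listing-side parse that would put them there; the spider name, URL handling, and JSON field names are placeholders, not taken from the original spiders.

import json
import scrapy

class ListingSketchSpider(scrapy.Spider):
    # Illustrative only: shows the request.meta hand-off pattern.
    name = 'listing_sketch'

    def parse(self, response):
        for entry in json.loads(response.text).get('items', []):
            yield scrapy.Request(
                entry['url'],
                # content_parse is any of the callbacks shown in these examples.
                callback=self.content_parse,
                # Forward listing fields so content_parse can fill the item
                # without re-fetching them.
                meta={'id': entry.get('id'),
                      'post_date': entry.get('published_at'),
                      'source': entry.get('publisher'),
                      'author': entry.get('author_name')})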
Example #9
    def content_parse(self, response):
        date = response.css('.author-timestamp::attr(content)').extract_first()
        if not date:
            return
        # Keep only the date and time parts of the timestamp.
        date = helper.list2str(
            re.findall(r'(\d{4}-\d{2}-\d{2}|\d{2}:\d{2})', date))
        if len(date) == 0:
            return
        try:
            if helper.compare_time(date, self.limittime) < 0:
                return
        except Exception:
            return

        pipleitem = CctvOpinionmonitor4Item()

        pipleitem['date'] = date
        article_id = re.findall(r'(\d{2,4}/).*', response.url)
        pipleitem['id'] = article_id[0] if article_id else None
        pipleitem['url'] = response.url
        pipleitem['title'] = response.css('.title::text').extract_first()
        pipleitem['source'] = 'WashingtonPost'
        pipleitem['content'] = helper.list2str(
            response.css('.paywall p').xpath('string(.)').extract())
        pipleitem['editor'] = response.css('.author::text').extract_first()
        pipleitem['views'] = None
        pipleitem['image_urls'] = helper.list2str(
            response.css('.paywall img::attr(src)').extract())
        pipleitem['video_urls'] = helper.list2str(
            response.css('.paywall video::attr(src)').extract())
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        # Pull the article's asset id out of the page source, then query for
        # its total comment count.
        url = re.findall(r'"@id":(\S*)', response.text)[0]
        rs = requests.get(url='{asset(url:' + url + '){totalCommentCount}}',
                          headers=self.headers).text
        counts = re.findall(r'\d+', rs)
        pipleitem['comment'] = counts[0] if counts else None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
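
Every example fills the same CctvOpinionmonitor4Item. Its definition is not shown on this page; below is a sketch of how such an item is typically declared in Scrapy, with the field list read off the assignments above.

import scrapy

# Hypothetical sketch of the item class the examples fill in; the real
# CctvOpinionmonitor4Item is defined elsewhere in the project.
class CctvOpinionmonitor4Item(scrapy.Item):
    date = scrapy.Field()
    id = scrapy.Field()
    url = scrapy.Field()
    title = scrapy.Field()
    source = scrapy.Field()
    content = scrapy.Field()
    editor = scrapy.Field()
    views = scrapy.Field()
    image_urls = scrapy.Field()
    video_urls = scrapy.Field()
    share = scrapy.Field()
    like = scrapy.Field()
    dislike = scrapy.Field()
    comment = scrapy.Field()
    crawl_time = scrapy.Field()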