Пример #1
0
    def content_parse(self, response):
        """Build an item from a JSON article-detail response.

        Article fields are read from the ``data`` object of the JSON body;
        ``date``, ``id``, ``title`` and ``source`` come from
        ``response.meta``.  ``date`` is divided by 1000 before being passed
        to ``helper.get_makedtime`` (presumably a millisecond epoch value --
        confirm against the request that fills ``meta``).

        Returns:
            A populated ``CctvOpinionmonitor3Item``, or ``None`` when the
            body has no usable ``data`` object, the date is missing or
            cannot be processed, or ``helper.compare_time`` against
            ``self.limittime`` is negative.
        """
        payload = json.loads(response.text)
        data = payload.get('data') if isinstance(payload, dict) else None
        # Skip bodies without a non-empty "data" object instead of raising.
        if not isinstance(data, dict) or not data:
            return None

        date = response.meta['date']
        if date is None or not str(date):
            return None
        try:
            day = helper.get_makedtime('%Y-%m-%d', date / 1000)
            if helper.compare_time(day, self.limittime) < 0:
                return None
        except Exception:
            # Best effort: an unusable date means the article is skipped.
            return None

        # "content" may be absent or null; both are treated as empty HTML.
        content_html = data.get('content') or ''

        pipleitem = CctvOpinionmonitor3Item()

        pipleitem['date'] = helper.get_makedtime('%Y-%m-%d %H:%M:%S',
                                                 date / 1000)
        pipleitem['id'] = response.meta['id']
        pipleitem['url'] = response.url
        pipleitem['title'] = response.meta['title']
        pipleitem['source'] = response.meta['source']
        pipleitem['editor'] = data.get('redactor')
        # Text nodes are whatever sits between a '>' and the next '<'.
        pipleitem['content'] = helper.list2str(
            re.findall('>(.*?)<', content_html))
        pipleitem['image_urls'] = helper.list2str(
            re.findall('<img.*?src="(.*?)"', content_html))
        pipleitem['video_urls'] = data.get('audio_video_url')
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['views'] = None
        pipleitem['comment'] = data.get('comment_count')
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
    def content_parse(self, response):
        """Assemble a ``CctvOpinionmonitor3Item`` from an HTML detail page.

        ``date``, ``id``, ``title``, ``editor`` and ``pic`` are taken from
        ``response.meta``; source, body text and video URL are selected from
        the page itself.

        Returns:
            The populated item, or ``None`` when the date is missing/empty,
            when comparing it with ``self.limittime`` fails, or when
            ``helper.compare_time`` returns a negative value.
        """
        published = response.meta['date']
        if published is None or not len(published):
            return None

        try:
            too_old = helper.compare_time(published, self.limittime) < 0
        except:  # noqa: E722 - any failure during the comparison skips the page
            return None
        if too_old:
            return None

        item = CctvOpinionmonitor3Item()

        item['date'] = helper.formatTime(published)
        item['id'] = response.meta['id']
        item['url'] = response.url
        item['title'] = response.meta['title']

        source_selector = '#detail_Info_Tab_cout4_1 p:nth-last-child(2)::text'
        item['source'] = response.css(source_selector).extract_first()
        item['editor'] = response.meta['editor']

        body_nodes = response.xpath(
            'string(//div[@id="detail_infotab_cont_1"])')
        item['content'] = helper.list2str(body_nodes.extract())
        item['image_urls'] = response.meta['pic']

        video_nodes = response.css('#videoPath::attr(value)')
        item['video_urls'] = helper.list2str(video_nodes.extract())

        # Interaction counters are not filled in by this callback.
        for counter in ('share', 'like', 'dislike', 'views', 'comment'):
            item[counter] = None
        item['crawl_time'] = helper.get_localtimestamp()

        return item
    def content_parse(self, response):
        """Build a ``YoutubeItem`` from a YouTube watch page.

        The publish date, title, channel name, description, view count,
        like/dislike counts and comment count are selected from the page;
        ``subscriber`` is taken from ``response.meta``.

        Returns:
            The populated item, or ``None`` when no publish date is found,
            the date cannot be compared, or ``helper.compare_time`` against
            ``self.limittime`` is negative.
        """
        date = response.xpath(
            '//span[@class="date style-scope ytd-video-secondary-info-renderer"]/text()'
        ).extract_first()
        if not date:
            return None
        try:
            # ``date`` is a single string (extract_first), so compare the
            # whole value; indexing it would pass only its first character.
            if helper.compare_time(date, self.limittime) < 0:
                return None
        except Exception:
            # Best effort: an unusable date means the video is skipped.
            return None

        pipleitem = YoutubeItem()

        pipleitem['date'] = helper.formatTime(date)
        video_ids = re.findall(r'v=(\S*)', response.url)
        pipleitem['id'] = video_ids[0] if video_ids else None
        pipleitem['url'] = response.url
        pipleitem['title'] = response.css('title::text').extract_first()
        pipleitem['source'] = 'Youtube'
        pipleitem['editor'] = response.xpath(
            '//yt-formatted-string[@id="owner-name"]/a/text()'
        ).extract_first()
        pipleitem['content'] = helper.list2str(
            response.css('#description').xpath('string(.)').extract()
        ).strip()
        views = response.xpath(
            '//span[@class="view-count style-scope yt-view-count-renderer"]/text()'
        ).extract_first()
        # A missing view counter is reported as '0' instead of crashing.
        pipleitem['views'] = re.sub(r'\D', '', views) if views else '0'
        pipleitem['image_urls'] = helper.list2str(
            response.css('img::attr(src)').extract())
        pipleitem['video_urls'] = helper.list2str(
            response.css('video::attr(src)').extract())
        pipleitem['share'] = None

        likes, dislikes = '0', '0'
        labels = response.xpath(
            '//yt-formatted-string[@id="text"]/@aria-label').extract()
        for label in labels:
            label = label.replace('No', '0')
            digits = re.sub(r'\D', '', label)
            # 'dislikes' contains 'likes', so it has to be tested first;
            # labels matching neither leave both counters untouched.
            if 'dislikes' in label:
                dislikes = digits
            elif 'likes' in label:
                likes = digits
        pipleitem['like'] = likes
        pipleitem['dislike'] = dislikes

        comment = response.xpath(
            '//h2[@id="count"]/yt-formatted-string/text()').extract_first()
        pipleitem['comment'] = re.sub(r'\D', '', comment) if comment else 0
        pipleitem['subscriber'] = response.meta['subscriber']

        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
Пример #4
0
    def content_parse(self, response):
        """Build an item from a JSON body whose article sits under ``result``.

        ``result['createTime']`` is parsed with the format
        ``'%a %b %d %H:%M:%S CST %Y'`` and re-rendered through
        ``helper.get_makedtime``.  A value that does not match that format
        still raises ``ValueError`` from ``time.strptime``.

        Returns:
            A populated ``CctvOpinionmonitor3Item``, or ``None`` when the
            body is empty, has no ``result`` object or no ``createTime``,
            when the date comparison fails, or when ``helper.compare_time``
            against ``self.limittime`` is negative.
        """
        payload = json.loads(response.text)
        if not payload:
            return None
        result = payload.get('result') if isinstance(payload, dict) else None
        if not isinstance(result, dict):
            return None

        # The presence check must happen before parsing: strptime raises on
        # a missing value.
        created = result.get('createTime')
        if not created:
            return None
        date = helper.get_makedtime(
            '%Y-%m-%d %H:%M:%S',
            time.mktime(time.strptime(created, '%a %b %d %H:%M:%S CST %Y')))
        if date is None:
            return None
        try:
            if helper.compare_time(date, self.limittime) < 0:
                return None
        except Exception:
            # Best effort: an uncomparable date means the article is skipped.
            return None

        pipleitem = CctvOpinionmonitor3Item()

        pipleitem['date'] = date
        pipleitem['id'] = result.get('id')
        pipleitem['url'] = response.url
        pipleitem['title'] = result.get('title')
        pipleitem['source'] = result.get('docSource')
        content_html = result.get('content')
        # Text nodes are whatever sits between a '>' and the next '<'.
        pipleitem['content'] = helper.list2str(
            re.findall('>(.*?)<', content_html)
        ) if content_html is not None else ''
        pipleitem['editor'] = result.get('author')
        pipleitem['views'] = None
        pipleitem['image_urls'] = helper.list2str(
            list(result.get('imgList') or []))
        pipleitem['video_urls'] = response.meta.get('videourl')
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['comment'] = result.get('comments')
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
    def content_parse(self, response):
        """Assemble a ``CctvOpinionmonitor3Item`` from a diary page.

        The date comes from ``h3.subheader time``, the id from the
        ``diary/<id>/`` part of the URL, and text, images and like count
        from the ``.markdown`` / ``.likes_count`` elements.

        Returns:
            The populated item, or ``None`` when the date is missing/empty,
            when comparing it with ``self.limittime`` fails, or when
            ``helper.compare_time`` returns a negative value.
        """
        posted = response.css('h3.subheader time::text').extract_first()
        if posted is None or not len(posted):
            return None

        try:
            too_old = helper.compare_time(posted, self.limittime) < 0
        except:  # noqa: E722 - any failure during the comparison skips the page
            return None
        if too_old:
            return None

        item = CctvOpinionmonitor3Item()

        item['date'] = helper.formatTime(posted)
        diary_ids = re.findall('diary/(.*)/', response.url)
        item['id'] = diary_ids[0] if diary_ids else None
        item['url'] = response.url
        item['title'] = response.css('title::text').extract_first()
        item['source'] = '中日通'
        item['editor'] = None

        markdown_text = response.css('.markdown').xpath('string(.)').extract()
        item['content'] = helper.list2str(markdown_text)
        markdown_images = response.css('.markdown img::attr(src)').extract()
        item['image_urls'] = helper.list2str(markdown_images)
        video_sources = response.css('video::attr(src)').extract()
        item['video_urls'] = helper.list2str(video_sources)

        item['share'] = None
        item['like'] = response.css('.likes_count::text').extract_first()
        for counter in ('dislike', 'views', 'comment'):
            item[counter] = None
        item['crawl_time'] = helper.get_localtimestamp()

        return item
    def content_parse(self, response):
        """Build an item from a JSON body whose fields sit under ``dramaInfo``.

        The date is the first run of digits/hyphens found in
        ``dramaInfo['updateDesc']``; the title comes from
        ``response.meta['name']`` when present.

        Returns:
            A populated ``CctvOpinionmonitor3Item``, or ``None`` when
            ``dramaInfo`` or ``updateDesc`` is missing, no date-like token
            is found, the date comparison fails, or ``helper.compare_time``
            against ``self.limittime`` is negative.
        """
        payload = json.loads(response.text)
        info = payload.get('dramaInfo') if isinstance(payload, dict) else None
        # Without the info object there is nothing to read any field from.
        if not isinstance(info, dict):
            return None

        update_desc = info.get('updateDesc')
        if not isinstance(update_desc, str):
            return None
        dates = re.findall(r'[\d-]+', update_desc)
        if not dates:
            return None
        try:
            if helper.compare_time(dates[0], self.limittime) < 0:
                return None
        except Exception:
            # Best effort: an uncomparable date means the entry is skipped.
            return None

        pipleitem = CctvOpinionmonitor3Item()

        pipleitem['date'] = helper.formatTime(dates[0])
        pipleitem['id'] = info.get('contId')
        pipleitem['url'] = response.url
        pipleitem['title'] = response.meta.get('name')
        pipleitem['source'] = info.get('type')
        pipleitem['editor'] = None
        pipleitem['content'] = info.get('description')
        pipleitem['image_urls'] = info.get('imageURL')
        pipleitem['video_urls'] = info.get('requestURL')
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['views'] = None
        pipleitem['comment'] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
    def content_parse(self, response):
        """Assemble a ``CctvOpinionmonitor3Item`` from a news detail page.

        Date, id, source, picture list and the view/comment counters are
        read from ``response.meta`` (each optional); title, body text and
        audio URL are selected from the page.

        Returns:
            The populated item, or ``None`` when ``newsTime`` is
            missing/empty, when comparing it with ``self.limittime`` fails,
            or when ``helper.compare_time`` returns a negative value.
        """
        meta = response.meta
        news_time = meta.get('newsTime')
        if news_time is None or not len(news_time):
            return None

        try:
            too_old = helper.compare_time(news_time, self.limittime) < 0
        except:  # noqa: E722 - any failure during the comparison skips the page
            return None
        if too_old:
            return None

        item = CctvOpinionmonitor3Item()

        item['date'] = helper.formatTime(news_time)
        item['id'] = meta.get('newsId')
        item['url'] = response.url
        item['title'] = response.css('title::text').extract_first()
        item['source'] = meta.get('newsResource')
        item['editor'] = None

        body_nodes = response.xpath('string(//div[@class="m_details-con"])')
        item['content'] = helper.list2str(body_nodes.extract())
        item['image_urls'] = helper.list2str(meta.get('picUrlList', []))
        item['video_urls'] = response.css('audio::attr(src)').extract_first()

        for counter in ('share', 'like', 'dislike'):
            item[counter] = None
        item['views'] = meta.get('articleCount')
        item['comment'] = meta.get('commentNum')
        item['crawl_time'] = helper.get_localtimestamp()

        return item
    def video_parse(self, response):
        """Build a ``CctvOpinionmonitor3Item`` from a CNN video page.

        The date is read from the ``pubdate`` meta tag, the id from the
        ``com/<id>/`` part of the URL, and the video URLs from
        ``"videoUrl": "..."`` occurrences in the raw page text.

        Returns:
            The populated item, or ``None`` when the ``pubdate`` meta tag is
            missing/empty, the date comparison fails, or
            ``helper.compare_time`` against ``self.limittime`` is negative.
        """
        date = response.xpath(
            '//meta[@name="pubdate"]/@content').extract_first()
        if not date:
            return None
        try:
            # NOTE(review): the filter uses helper.formatTime2 while the
            # stored date below uses helper.formatTime -- confirm that this
            # difference is intended.
            if helper.compare_time(helper.formatTime2(date.strip()),
                                   self.limittime) < 0:
                return None
        except Exception:
            # Best effort: an unusable date means the page is skipped.
            return None

        pipleitem = CctvOpinionmonitor3Item()

        pipleitem['date'] = helper.formatTime(date.strip())
        video_ids = re.findall('com/(.*)/', response.url)
        pipleitem['id'] = video_ids[0] if video_ids else None
        pipleitem['url'] = response.url
        pipleitem['title'] = response.css('title::text').extract_first()
        pipleitem['source'] = 'CNN'
        pipleitem['editor'] = response.xpath(
            '//meta[@name="author"]/@content').extract_first()
        # '#' must be followed by an identifier in CSS, so the former
        # '#[id~=...]' form was not a valid selector; the attribute selector
        # alone expresses the match.
        pipleitem['content'] = helper.list2str(
            response.css('[id~=js-video_description]::text').extract())
        pipleitem['image_urls'] = helper.list2str(
            response.css('img::attr(src)').extract())
        pipleitem['video_urls'] = helper.list2str(
            re.findall(r'"videoUrl":\s*"(.*?)"', response.text))
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['views'] = None
        pipleitem['comment'] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
Пример #9
0
    def content_parse(self, response):
        """Build an item from a response that embeds a JSON object.

        The first ``{...}`` span of the response text is parsed as JSON.
        The view count is fetched with an extra HTTP request to the
        ``detail.js`` endpoint; that lookup is optional and leaves ``views``
        as ``None`` when it fails.

        Returns:
            A populated ``CctvOpinionmonitor3Item``, or ``None`` when no
            JSON object is found, ``releasedate`` is missing, the date
            comparison fails, or ``helper.compare_time`` against
            ``self.limittime`` is negative.
        """
        found = re.findall(r'\{.*}', response.text)
        # Fall back to an empty dict (not the string '{}') so that the
        # emptiness check below actually fires.
        jsonbd = json.loads(found[0]) if found else {}
        if not jsonbd:
            return None
        date = jsonbd.get('releasedate')
        if date is None:
            return None
        try:
            if helper.compare_time(date, self.limittime) < 0:
                return None
        except Exception:
            # Best effort: an uncomparable date means the article is skipped.
            return None

        pipleitem = CctvOpinionmonitor3Item()

        pipleitem['date'] = date
        pipleitem['id'] = jsonbd.get('id')
        pipleitem['url'] = response.url
        pipleitem['title'] = jsonbd.get('topic')
        pipleitem['source'] = jsonbd.get('docSource')
        content_html = jsonbd.get('content')
        # Text nodes are whatever sits between a '>' and the next '<'.
        pipleitem['content'] = helper.list2str(
            re.findall('>(.*?)<', content_html)
            if content_html is not None else '')
        editor = re.findall('编辑:(.*?)<', pipleitem['content'])
        pipleitem['editor'] = editor[0] if editor else None

        # Start from "no views" so a failed lookup cannot leave the name
        # unbound.
        views = []
        try:
            html = requests.get(
                'http://xhpfmapi.zhongguowangshi.com/v600/news/detail.js?docid={}&share=1'
                .format(jsonbd['id']),
                timeout=10)
            views = re.findall(r'"pvs":(\d*)', html.text)
        except (KeyError, requests.RequestException):
            # The view count is optional; keep the item without it.
            pass
        pipleitem['views'] = views[0] if views else None

        image_list = [
            image['smallImageLink']
            for image in (jsonbd.get('contentimglist') or [])
        ]
        pipleitem['image_urls'] = helper.list2str(image_list)
        pipleitem['video_urls'] = response.meta.get('videourl')
        pipleitem['share'] = jsonbd.get('share')
        pipleitem['like'] = jsonbd.get('supportcount')
        pipleitem['dislike'] = None
        pipleitem['comment'] = jsonbd.get('comment')
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
    def content_parse(self, response):
        """Build an item from a JSON list whose first element is the article.

        View and comment counts come from a second HTTP request to
        ``self.url`` formatted with the article id.  That lookup is
        optional: if the request or its JSON parsing fails, both counters
        are left as ``None`` and the item is still returned.

        Returns:
            A populated ``CctvOpinionmonitor3Item``, or ``None`` when the
            list is empty, ``time`` is missing/empty, the date comparison
            fails, or ``helper.compare_time`` against ``self.limittime`` is
            negative.
        """
        jsonbd = json.loads(response.text)
        if not jsonbd:
            return None
        cards = jsonbd[0]
        date = cards.get('time')

        if not date:
            return None
        try:
            if helper.compare_time(date, self.limittime) < 0:
                return None
        except Exception:
            # Best effort: an uncomparable date means the article is skipped.
            return None

        pipleitem = CctvOpinionmonitor3Item()

        pipleitem['date'] = helper.formatTime(date)
        pipleitem['id'] = cards.get('GlobalID')
        pipleitem['url'] = response.url
        pipleitem['title'] = cards.get('title')
        pipleitem['source'] = cards.get('docfrom')
        # 'autor' (sic) is the key read from the payload.
        pipleitem['editor'] = cards.get('autor')
        pipleitem['content'] = cards.get('summary')
        pipleitem['image_urls'] = cards.get('photo')
        pipleitem['video_urls'] = cards.get('videourl')

        # Example request (presumably the shape self.url expands to --
        # confirm):
        # 'http://apiapp.people.cn/apiv3.3.0/get_article_info.php?globalids=1061648&parents=0,0,0,0,0,0&juxian_liveid=0,0,0,0,0,0&juxian_companyid=0,0,0,0,0,0&deviceid=A000009114F247'
        stats_url = self.url.format(pipleitem['id'])
        try:
            html = requests.get(url=stats_url, timeout=10)
            bd = json.loads(html.text)
        except (requests.RequestException, ValueError):
            # A failed stats lookup must not discard the parsed article.
            bd = None
        results = bd.get('result') if isinstance(bd, dict) else None
        stats = results[0] if isinstance(results, list) and results else None
        if not isinstance(stats, dict):
            stats = {}

        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['views'] = stats.get('views')
        pipleitem['comment'] = stats.get('commentnum')
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
Пример #11
0
    def content_parse(self, response):
        """Build an item from the first card of the first ``cardgroups`` entry.

        The id comes from ``response.meta['id']``; all other fields are read
        from the card, each falling back to ``None`` (or an empty value)
        when absent.

        Returns:
            A populated ``CctvOpinionmonitor3Item``, or ``None`` when there
            are no card groups, the first group has no cards, the card has
            no date, the date comparison fails, or ``helper.compare_time``
            against ``self.limittime`` is negative.
        """
        jsonbd = json.loads(response.text)
        groups = jsonbd.get('cardgroups') if isinstance(jsonbd, dict) else None
        if not groups:
            return None
        first_group = groups[0]
        # A group without a "cards" list is treated like an empty group.
        group_cards = first_group.get('cards') if isinstance(
            first_group, dict) else None
        cards = group_cards[0] if group_cards else {}

        date = helper.formatTime(cards['date']) if 'date' in cards else None
        if not date:
            return None
        try:
            if helper.compare_time(date, self.limittime) < 0:
                return None
        except Exception:
            # Best effort: an uncomparable date means the card is skipped.
            return None

        pipleitem = CctvOpinionmonitor3Item()

        pipleitem['date'] = date
        pipleitem['id'] = response.meta['id']
        pipleitem['url'] = response.url
        pipleitem['title'] = cards.get('title')
        pipleitem['source'] = cards.get('source')
        pipleitem['editor'] = None
        # "content" may be absent or null; both are treated as empty HTML.
        pipleitem['content'] = helper.list2str(
            re.findall('>(.*?)<', cards.get('content') or ''))
        pipleitem['image_urls'] = helper.list2str(cards.get('photoList', []))
        video = cards.get('video')
        # Only read the URL when "video" really is an object carrying one.
        pipleitem['video_urls'] = video.get('url') if isinstance(
            video, dict) else None
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['views'] = None
        pipleitem['comment'] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem
    def content_parse(self, response):
        """Build a ``CctvOpinionmonitor3Item`` from a CGTN news page.

        The date is read from ``span.date`` and the id from the
        ``news/<id>`` part of the URL.  When the text of
        ``#cmsMainContent`` is 10 characters or shorter, its ``data-json``
        attribute is used as content instead.  For responses whose
        ``meta['key']`` equals ``'live'`` the video URLs are fetched from
        the live-event API; a failed lookup leaves the list empty.

        Returns:
            The populated item, or ``None`` when the date is missing/empty,
            the date comparison fails, or ``helper.compare_time`` against
            ``self.limittime`` is negative.
        """
        date = response.css('span.date::text').extract_first()
        if not date:
            return None
        try:
            if helper.compare_time(helper.formatTime(date.strip()),
                                   self.limittime) < 0:
                return None
        except Exception:
            # Best effort: an unusable date means the page is skipped.
            return None

        pipleitem = CctvOpinionmonitor3Item()

        pipleitem['date'] = helper.formatTime(date.strip())
        news_ids = re.findall(r'news/([a-z\d]*)', response.url)
        pipleitem['id'] = news_ids[0] if news_ids else None
        pipleitem['url'] = response.url
        pipleitem['title'] = response.css('title::text').extract_first()
        pipleitem['source'] = 'CGTN'
        pipleitem['editor'] = response.xpath(
            '//div[@class="news-author news-text"]/text()').extract_first()
        content = helper.list2str(
            response.xpath('string(//div[@id="cmsMainContent"])').extract())
        pipleitem['content'] = content if len(
            content) > 10 else helper.list2str(
                response.css('#cmsMainContent::attr(data-json)').extract())
        pipleitem['image_urls'] = helper.list2str(
            response.css('.cg-padding img::attr(src)').extract())

        video_urls = []
        if response.meta.get('key') == 'live':
            try:
                html = requests.get(
                    'https://mapi.cgtn.com/mobileapp/v2/live/event/info?id={}'
                    .format(pipleitem['id']),
                    timeout=10).text
                video_urls = [
                    video['url']
                    for video in json.loads(html)['response']['videos']
                ]
            except (requests.RequestException, ValueError, KeyError,
                    TypeError):
                # The live lookup is optional; keep the item without videos.
                video_urls = []

        pipleitem['video_urls'] = helper.list2str(video_urls)
        pipleitem['share'] = None
        pipleitem['like'] = None
        pipleitem['dislike'] = None
        pipleitem['views'] = None
        pipleitem['comment'] = None
        pipleitem['crawl_time'] = helper.get_localtimestamp()

        return pipleitem