def content_parse(self, response):
    """Parse a JSON article-detail response into a CctvOpinionmonitor3Item.

    Skips the item when the payload is empty, the forwarded date is
    missing, or the article predates ``self.limittime``.
    """
    jsonbd = json.loads(response.text)
    if not jsonbd['data']:
        return
    # millisecond epoch timestamp forwarded by the listing parser
    date = response.meta['date']
    if date is None or len(str(date)) == 0:
        return
    try:
        if helper.compare_time(helper.get_makedtime('%Y-%m-%d', date / 1000),
                               self.limittime) < 0:
            return
    except Exception:
        # unparsable timestamp -- skip the item rather than crash the spider
        return
    data = jsonbd['data']
    pipleitem = CctvOpinionmonitor3Item()
    pipleitem['date'] = helper.get_makedtime('%Y-%m-%d %H:%M:%S', date / 1000)
    pipleitem['id'] = response.meta['id']
    pipleitem['url'] = response.url
    pipleitem['title'] = response.meta['title']
    pipleitem['source'] = response.meta['source']
    pipleitem['editor'] = data.get('redactor')
    # strip tags from the HTML body; `or ''` also guards an explicit null
    # content (the original would pass None to re.findall and crash)
    content_html = data.get('content') or ''
    pipleitem['content'] = helper.list2str(re.findall('>(.*?)<', content_html))
    pipleitem['image_urls'] = helper.list2str(
        re.findall('<img.*?src="(.*?)"', content_html))
    pipleitem['video_urls'] = data.get('audio_video_url')
    pipleitem['share'] = None
    pipleitem['like'] = None
    pipleitem['dislike'] = None
    pipleitem['views'] = None
    pipleitem['comment'] = data.get('comment_count')
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response):
    """Parse an article detail page; most metadata arrives via response.meta.

    Returns a populated CctvOpinionmonitor3Item, or None when the date is
    missing/too old (older than ``self.limittime``).
    """
    date = response.meta['date']
    # covers both None and empty string (original used `== None` + len check)
    if not date:
        return
    try:
        if helper.compare_time(date, self.limittime) < 0:
            return
    except Exception:
        # unparsable date -- drop the item quietly
        return
    pipleitem = CctvOpinionmonitor3Item()
    pipleitem['date'] = helper.formatTime(date)
    pipleitem['id'] = response.meta['id']
    pipleitem['url'] = response.url
    pipleitem['title'] = response.meta['title']
    pipleitem['source'] = response.css(
        '#detail_Info_Tab_cout4_1 p:nth-last-child(2)::text'
    ).extract_first()
    pipleitem['editor'] = response.meta['editor']
    pipleitem['content'] = helper.list2str(
        response.xpath('string(//div[@id="detail_infotab_cont_1"])').extract())
    pipleitem['image_urls'] = response.meta['pic']
    pipleitem['video_urls'] = helper.list2str(
        response.css('#videoPath::attr(value)').extract())
    pipleitem['share'] = None
    pipleitem['like'] = None
    pipleitem['dislike'] = None
    pipleitem['views'] = None
    pipleitem['comment'] = None
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response):
    """Parse a YouTube watch page into a YoutubeItem.

    Fixes vs. original: compare the whole date string (not ``date[0]``,
    the first character); test 'dislikes' before 'likes' since
    ``re.search('likes', ...)`` also matches 'dislikes'; guard the
    views/comment counters against a missing node (None).
    """
    date = response.xpath(
        '//span[@class="date style-scope ytd-video-secondary-info-renderer"]/text()'
    ).extract_first()
    if not date:
        return
    try:
        if helper.compare_time(date, self.limittime) < 0:
            return
    except Exception:
        return
    pipleitem = YoutubeItem()
    pipleitem['date'] = helper.formatTime(date)
    pipleitem['id'] = re.findall(r'v=(\S*)', response.url)[0]
    pipleitem['url'] = response.url
    pipleitem['title'] = response.css('title::text').extract_first()
    pipleitem['source'] = 'Youtube'
    pipleitem['editor'] = response.xpath(
        '//yt-formatted-string[@id="owner-name"]/a/text()').extract_first()
    pipleitem['content'] = helper.list2str(
        response.css('#description').xpath('string(.)').extract()).strip()
    views = response.xpath(
        '//span[@class="view-count style-scope yt-view-count-renderer"]/text()'
    ).extract_first()
    # extract_first() may return None; original called len(None) -> TypeError
    pipleitem['views'] = re.sub(r'\D', '', views) if views else '0'
    pipleitem['image_urls'] = helper.list2str(
        response.css('img::attr(src)').extract())
    pipleitem['video_urls'] = helper.list2str(
        response.css('video::attr(src)').extract())
    pipleitem['share'] = None
    pipleitem['like'] = '0'
    pipleitem['dislike'] = '0'
    for label in response.xpath(
            '//yt-formatted-string[@id="text"]/@aria-label').extract():
        label = label.replace('No', '0')
        digits = re.sub(r'\D', '', label)
        # 'dislikes' must be checked first -- it contains 'likes' as substring
        if re.search('dislikes', label):
            pipleitem['dislike'] = digits
        elif re.search('likes', label):
            pipleitem['like'] = digits
    comment = response.xpath(
        '//h2[@id="count"]/yt-formatted-string/text()').extract_first()
    # original default is the int 0 here (unlike views); preserved
    pipleitem['comment'] = re.sub(r'\D', '', comment) if comment else 0
    pipleitem['subscriber'] = response.meta['subscriber']
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response):
    """Parse a JSON article response into a CctvOpinionmonitor3Item.

    Fix vs. original: the createTime was fed to ``time.strptime`` *before*
    the None-check and outside the try, so a missing 'result'/'createTime'
    raised an uncaught TypeError/ValueError. Now validated first.
    """
    jsonbd = json.loads(response.text)
    if not jsonbd:
        return
    result = jsonbd.get('result')
    if not result:
        return
    raw_date = result.get('createTime')
    if raw_date is None:
        return
    try:
        # createTime format observed: 'Mon Jan 01 12:00:00 CST 2020'
        date = helper.get_makedtime(
            '%Y-%m-%d %H:%M:%S',
            time.mktime(time.strptime(raw_date, '%a %b %d %H:%M:%S CST %Y')))
        if helper.compare_time(date, self.limittime) < 0:
            return
    except Exception:
        # unparsable or out-of-range date -- skip the item
        return
    pipleitem = CctvOpinionmonitor3Item()
    pipleitem['date'] = date
    pipleitem['id'] = result.get('id')
    pipleitem['url'] = response.url
    pipleitem['title'] = result.get('title')
    pipleitem['source'] = result.get('docSource')
    content_html = result.get('content')
    pipleitem['content'] = (helper.list2str(re.findall('>(.*?)<', content_html))
                            if content_html is not None else '')
    pipleitem['editor'] = result.get('author')
    pipleitem['views'] = None
    # `or []` also covers an explicit null imgList
    pipleitem['image_urls'] = helper.list2str(list(result.get('imgList') or []))
    pipleitem['video_urls'] = response.meta.get('videourl')
    pipleitem['share'] = None
    pipleitem['like'] = None
    pipleitem['dislike'] = None
    pipleitem['comment'] = result.get('comments')
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response):
    """Parse a diary article page into a CctvOpinionmonitor3Item."""
    date = response.css('h3.subheader time::text').extract_first()
    if not date:
        return
    try:
        if helper.compare_time(date, self.limittime) < 0:
            return
    except Exception:
        return
    pipleitem = CctvOpinionmonitor3Item()
    pipleitem['date'] = helper.formatTime(date)
    # pull the article id out of the URL path; don't shadow builtin `id`
    # (re.findall never returns None, so only emptiness needs checking)
    ids = re.findall('diary/(.*)/', response.url)
    pipleitem['id'] = ids[0] if ids else None
    pipleitem['url'] = response.url
    pipleitem['title'] = response.css('title::text').extract_first()
    pipleitem['source'] = '中日通'
    pipleitem['editor'] = None
    pipleitem['content'] = helper.list2str(
        response.css('.markdown').xpath('string(.)').extract())
    pipleitem['image_urls'] = helper.list2str(
        response.css('.markdown img::attr(src)').extract())
    pipleitem['video_urls'] = helper.list2str(
        response.css('video::attr(src)').extract())
    pipleitem['share'] = None
    pipleitem['like'] = response.css('.likes_count::text').extract_first()
    pipleitem['dislike'] = None
    pipleitem['views'] = None
    pipleitem['comment'] = None
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response):
    """Parse a drama-info JSON response into a CctvOpinionmonitor3Item.

    Fix vs. original: when 'dramaInfo' was absent, ``info`` was None and
    ``info.keys()`` raised AttributeError; when 'updateDesc' was absent,
    ``re.findall(..., None)`` raised TypeError. Both now return early.
    """
    jsonbd = json.loads(response.text)
    info = jsonbd.get('dramaInfo')
    if not info:
        return
    raw_date = info.get('updateDesc')
    if not raw_date:
        return
    # extract the date-like fragment (digits and dashes) from the free text
    date = re.findall(r'[\d-]+', raw_date)
    if not date:
        return
    try:
        if helper.compare_time(date[0], self.limittime) < 0:
            return
    except Exception:
        return
    pipleitem = CctvOpinionmonitor3Item()
    pipleitem['date'] = helper.formatTime(date[0])
    pipleitem['id'] = info.get('contId')
    pipleitem['url'] = response.url
    pipleitem['title'] = response.meta.get('name')
    pipleitem['source'] = info.get('type')
    pipleitem['editor'] = None
    pipleitem['content'] = info.get('description')
    pipleitem['image_urls'] = info.get('imageURL')
    pipleitem['video_urls'] = info.get('requestURL')
    pipleitem['share'] = None
    pipleitem['like'] = None
    pipleitem['dislike'] = None
    pipleitem['views'] = None
    pipleitem['comment'] = None
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response):
    """Parse a news detail page; counters and ids arrive via response.meta."""
    date = response.meta.get('newsTime')
    if not date:
        return
    try:
        if helper.compare_time(date, self.limittime) < 0:
            return
    except Exception:
        return
    pipleitem = CctvOpinionmonitor3Item()
    pipleitem['date'] = helper.formatTime(date)
    pipleitem['id'] = response.meta.get('newsId')
    pipleitem['url'] = response.url
    pipleitem['title'] = response.css('title::text').extract_first()
    pipleitem['source'] = response.meta.get('newsResource')
    pipleitem['editor'] = None
    pipleitem['content'] = helper.list2str(
        response.xpath('string(//div[@class="m_details-con"])').extract())
    pipleitem['image_urls'] = helper.list2str(
        response.meta.get('picUrlList', []))
    # audio src doubles as the media URL on this site
    pipleitem['video_urls'] = response.css('audio::attr(src)').extract_first()
    pipleitem['share'] = None
    pipleitem['like'] = None
    pipleitem['dislike'] = None
    pipleitem['views'] = response.meta.get('articleCount')
    pipleitem['comment'] = response.meta.get('commentNum')
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def video_parse(self, response):
    """Parse a CNN video page into a CctvOpinionmonitor3Item.

    Fix vs. original: the content selector was ``'#[id~=...]::text'`` --
    invalid CSS (``#`` must be followed by an identifier), which raises a
    selector parse error at runtime. The bare attribute selector
    ``[id~=js-video_description]`` expresses the intended match.
    """
    date = response.xpath('//meta[@name="pubdate"]/@content').extract_first()
    if not date:
        return
    try:
        if helper.compare_time(helper.formatTime2(date.strip()),
                               self.limittime) < 0:
            return
    except Exception:
        return
    pipleitem = CctvOpinionmonitor3Item()
    pipleitem['date'] = helper.formatTime(date.strip())
    # avoid shadowing builtin `id`; findall never returns None
    ids = re.findall('com/(.*)/', response.url)
    pipleitem['id'] = ids[0] if ids else None
    pipleitem['url'] = response.url
    pipleitem['title'] = response.css('title::text').extract_first()
    pipleitem['source'] = 'CNN'
    pipleitem['editor'] = response.xpath(
        '//meta[@name="author"]/@content').extract_first()
    pipleitem['content'] = helper.list2str(
        response.css('[id~=js-video_description]::text').extract())
    pipleitem['image_urls'] = helper.list2str(
        response.css('img::attr(src)').extract())
    # stream URLs are embedded in inline JSON, not in the DOM
    pipleitem['video_urls'] = helper.list2str(
        re.findall(r'"videoUrl":\s*"(.*?)"', response.text))
    pipleitem['share'] = None
    pipleitem['like'] = None
    pipleitem['dislike'] = None
    pipleitem['views'] = None
    pipleitem['comment'] = None
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response):
    """Parse a JSONP-ish article payload into a CctvOpinionmonitor3Item.

    Fixes vs. original: the no-match fallback was the *string* ``'{}'``
    (len 2, so the emptiness guard passed and ``.keys()`` on a str raised
    AttributeError) -- now an empty dict; ``views`` was unbound (NameError)
    whenever the secondary request failed -- now initialized first.
    """
    matches = re.findall(r'\{.*}', response.text)
    jsonbd = json.loads(matches[0]) if matches else {}
    if not jsonbd:
        return
    date = jsonbd.get('releasedate')
    if date is None:
        return
    try:
        if helper.compare_time(date, self.limittime) < 0:
            return
    except Exception:
        return
    pipleitem = CctvOpinionmonitor3Item()
    pipleitem['date'] = date
    pipleitem['id'] = jsonbd.get('id')
    pipleitem['url'] = response.url
    pipleitem['title'] = jsonbd.get('topic')
    pipleitem['source'] = jsonbd.get('docSource')
    content_html = jsonbd.get('content')
    pipleitem['content'] = helper.list2str(
        re.findall('>(.*?)<', content_html) if content_html is not None else '')
    # NOTE(review): the content has already had its tags stripped, so this
    # '编辑:(.*?)<' pattern rarely matches -- kept as-is pending verification
    editor = re.findall('编辑:(.*?)<', pipleitem['content'])
    pipleitem['editor'] = editor[0] if editor else None
    views = []
    try:
        # best-effort page-view lookup from the detail endpoint
        html = requests.get(
            'http://xhpfmapi.zhongguowangshi.com/v600/news/detail.js?docid={}&share=1'
            .format(jsonbd['id']))
        views = re.findall(r'"pvs":(\d*)', html.text)
    except Exception:
        pass
    pipleitem['views'] = views[0] if views else None
    pipleitem['image_urls'] = helper.list2str(
        [img['smallImageLink'] for img in (jsonbd.get('contentimglist') or [])])
    pipleitem['video_urls'] = response.meta.get('videourl')
    pipleitem['share'] = jsonbd.get('share')
    pipleitem['like'] = jsonbd.get('supportcount')
    pipleitem['dislike'] = None
    pipleitem['comment'] = jsonbd.get('comment')
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response):
    """Parse an article card (JSON list) into a CctvOpinionmonitor3Item.

    Fix vs. original: the secondary stats request had no error handling,
    so a network/JSON failure aborted the whole item; it is now
    best-effort and the counters fall back to None.
    """
    jsonbd = json.loads(response.text)
    if not jsonbd:
        return
    cards = jsonbd[0]
    date = cards.get('time')
    if not date:
        return
    try:
        if helper.compare_time(date, self.limittime) < 0:
            return
    except Exception:
        return
    pipleitem = CctvOpinionmonitor3Item()
    pipleitem['date'] = helper.formatTime(date)
    pipleitem['id'] = cards.get('GlobalID')
    pipleitem['url'] = response.url
    pipleitem['title'] = cards.get('title')
    pipleitem['source'] = cards.get('docfrom')
    # 'autor' is the upstream API's spelling, not a typo here
    pipleitem['editor'] = cards.get('autor')
    pipleitem['content'] = cards.get('summary')
    pipleitem['image_urls'] = cards.get('photo')
    pipleitem['video_urls'] = cards.get('videourl')
    # view/comment counters live behind a second synchronous request
    bd = None
    try:
        html = requests.get(url=self.url.format(pipleitem['id']))
        result = json.loads(html.text).get('result')
        if result:
            bd = result[0]
    except Exception:
        bd = None
    pipleitem['share'] = None
    pipleitem['like'] = None
    pipleitem['dislike'] = None
    pipleitem['views'] = bd.get('views') if bd is not None else None
    pipleitem['comment'] = bd.get('commentnum') if bd is not None else None
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    # 'http://apiapp.people.cn/apiv3.3.0/get_article_info.php?globalids=1061648&parents=0,0,0,0,0,0&juxian_liveid=0,0,0,0,0,0&juxian_companyid=0,0,0,0,0,0&deviceid=A000009114F247'
    return pipleitem
def content_parse(self, response):
    """Parse a cardgroups JSON response into a CctvOpinionmonitor3Item.

    Fixes vs. original: ``cards['cards']`` raised KeyError when the key
    was absent; ``cards['video']['url']`` raised TypeError/KeyError when
    'video' was null or lacked 'url'. Both are now guarded.
    """
    jsonbd = json.loads(response.text)
    cardgroups = jsonbd.get('cardgroups')
    if not cardgroups:
        return
    group = cardgroups[0]
    cards = group['cards'][0] if group.get('cards') else {}
    date = helper.formatTime(cards['date']) if 'date' in cards else None
    if not date:
        return
    try:
        if helper.compare_time(date, self.limittime) < 0:
            return
    except Exception:
        return
    pipleitem = CctvOpinionmonitor3Item()
    pipleitem['date'] = date
    pipleitem['id'] = response.meta['id']
    pipleitem['url'] = response.url
    pipleitem['title'] = cards.get('title')
    pipleitem['source'] = cards.get('source')
    pipleitem['editor'] = None
    # strip tags from the HTML fragment
    pipleitem['content'] = helper.list2str(
        re.findall('>(.*?)<', cards.get('content', '')))
    pipleitem['image_urls'] = helper.list2str(cards.get('photoList', []))
    video = cards.get('video')
    pipleitem['video_urls'] = video.get('url') if video else None
    pipleitem['share'] = None
    pipleitem['like'] = None
    pipleitem['dislike'] = None
    pipleitem['views'] = None
    pipleitem['comment'] = None
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem
def content_parse(self, response):
    """Parse a CGTN news/live page into a CctvOpinionmonitor3Item.

    Fixes vs. original: the local named ``list`` shadowed the builtin;
    ``response.meta['key']`` hard-failed when absent (now ``.get``);
    the live-event API call was unguarded, so a network/JSON failure
    aborted the item (now best-effort).
    """
    date = response.css('span.date::text').extract_first()
    if not date:
        return
    try:
        if helper.compare_time(helper.formatTime(date.strip()),
                               self.limittime) < 0:
            return
    except Exception:
        return
    pipleitem = CctvOpinionmonitor3Item()
    pipleitem['date'] = helper.formatTime(date.strip())
    ids = re.findall(r'news/([a-z\d]*)', response.url)
    pipleitem['id'] = ids[0] if ids else None
    pipleitem['url'] = response.url
    pipleitem['title'] = response.css('title::text').extract_first()
    pipleitem['source'] = 'CGTN'
    pipleitem['editor'] = response.xpath(
        '//div[@class="news-author news-text"]/text()').extract_first()
    content = helper.list2str(
        response.xpath('string(//div[@id="cmsMainContent"])').extract())
    # fall back to the embedded data-json payload when rendered text is
    # (near-)empty; threshold 10 matches the original heuristic
    pipleitem['content'] = content if len(content) > 10 else helper.list2str(
        response.css('#cmsMainContent::attr(data-json)').extract())
    pipleitem['image_urls'] = helper.list2str(
        response.css('.cg-padding img::attr(src)').extract())
    video_urls = []
    if response.meta.get('key') == 'live':
        # live events expose their stream URLs via a separate mobile API
        try:
            html = requests.get(
                'https://mapi.cgtn.com/mobileapp/v2/live/event/info?id={}'
                .format(pipleitem['id'])).text
            for video in json.loads(html)['response']['videos']:
                video_urls.append(video['url'])
        except Exception:
            pass
    pipleitem['video_urls'] = helper.list2str(video_urls)
    pipleitem['share'] = None
    pipleitem['like'] = None
    pipleitem['dislike'] = None
    pipleitem['views'] = None
    pipleitem['comment'] = None
    pipleitem['crawl_time'] = helper.get_localtimestamp()
    return pipleitem