def tryparse(self, raw):
    # Decode the raw bytes, then parse the JSON wrapper
    strjson = raw.decode("UTF-8", "ignore")
    strjson = json.loads(strjson)
    url = strjson['url']
    result = urllib.parse.urlparse(url)
    params = urllib.parse.parse_qs(result.query, True)
    category = ""  # column / channel name
    # Dispatch on the request URL
    if url.find('api.3g.ifeng.com/get_pic_list?channel=news') > -1:
        category = "图片"
        categorytag = self.categroytag["%s" % category]
    elif url.find('api.iclient.ifeng.com/ClientNews') > -1:
        category = params['id'][0]
        if category == "SYLB10,SYDT10,SYRECOMMEND" or category == "SYLB10,SYDT10":
            category = "头条"
            categorytag = self.categroytag["%s" % category]
        elif category == "RECOMVIDEO":
            category = "视频"
            categorytag = self.categroytag["%s" % category]
        elif category == "YAOWEN223":
            category = "要闻"
            categorytag = self.categroytag["%s" % category]
        elif category == "VIDEOSHORT":
            category = "小视频"
            categorytag = self.categroytag["%s" % category]
        else:
            SingleLogger().log.debug("unexpected url (1)")
            return
    else:
        SingleLogger().log.debug("unexpected url (2)")
        return
    crawltime = strjson['time']
    # Parse the payload
    data = json.loads(strjson['data'])
    label = ""
    # The "图片" column nests its items differently
    if category == "图片":
        item = data['body']['item']
        for y, x in enumerate(item):
            self.Analysis_fenghuang(x, category, crawltime, y, categorytag, label)
    else:
        for y1, curobj1 in enumerate(data):
            item = curobj1['item']
            label = "置顶" if curobj1['type'] == "top" else ""
            for y2, curobj2 in enumerate(item):
                self.Analysis_fenghuang(curobj2, category, crawltime, y2, categorytag, label)
def tryparse(self, raw):
    # Decode the raw bytes, then parse the JSON wrapper
    strjson = raw.decode("UTF-8", "ignore")
    strjson = json.loads(strjson)
    url = strjson['url']
    result = urllib.parse.urlparse(url)
    params = urllib.parse.parse_qs(result.query, True)
    crawltime = strjson['time']
    # Dispatch on the channel parameter
    category = ""  # column / channel name
    try:
        category = params['channel'][0]
    except:
        SingleLogger().log.debug("=====category======>%s" % params)
    if category == "news_jingyao":
        category = "要闻"
        categorytag = self.categroytag["%s" % category]
    elif category == "news_toutiao":
        category = "推荐"
        categorytag = self.categroytag["%s" % category]
    elif category == "news_video":
        category = "视频"
        categorytag = self.categroytag["%s" % category]
    elif category == "news_pic":
        category = "图片"
        categorytag = self.categroytag["%s" % category]
    else:
        SingleLogger().log.debug("=====unexpected channel======>%s" % category)
        return
    # Parse the payload
    data = json.loads(strjson['data'])
    # Regular feed entries
    if data['data']['feed'] and data['data']['feed'] != '':
        feed = data['data']['feed']
        for y1, curobj1 in enumerate(feed):
            self.Analysis_sina(curobj1, category, crawltime, y1, categorytag)
    # Ad feed entries (the block may be absent)
    try:
        if data['data']['ad']['feed'] and data['data']['ad']['feed'] != '':
            ad = data['data']['ad']['feed']
            for y2, curobj2 in enumerate(ad):
                self.Analysis_sina(curobj2, category, crawltime, y2, categorytag)
    except:
        pass
def getHtmlImages(self, url):
    html = Http.get(url)
    soup = BeautifulSoup(html, "html.parser")  # parsed document
    imgStr = ""
    for k in soup.find_all('img'):  # walk every <img> tag
        try:
            imgStr += k['data-src'] + ","
        except:
            SingleLogger().log.debug("no data-src attribute")
            try:
                imgStr += k['src'] + ","
            except:
                SingleLogger().log.debug("no src attribute")
    return imgStr
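# A minimal alternative sketch (not part of the original module): Tag.get()
# returns None instead of raising KeyError, so the data-src/src fallback
# collapses into one expression and the nested try/except disappears.
def getHtmlImagesSketch(self, url):
    html = Http.get(url)
    soup = BeautifulSoup(html, "html.parser")
    # Prefer the lazy-load attribute, fall back to the plain src
    srcs = [k.get('data-src') or k.get('src') for k in soup.find_all('img')]
    return ",".join(s for s in srcs if s)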
def tryparse(self, raw):
    # Decode the raw bytes, then parse the JSON wrapper
    strjson = raw.decode("UTF-8", "ignore")
    strjson = json.loads(strjson)
    url = strjson['url']
    result = urllib.parse.urlparse(url)
    params = urllib.parse.parse_qs(result.query, True)
    category = ""  # column / channel name
    try:
        category = params['chlid'][0]
        if category == "news_news_top":
            category = "要闻"
            categorytag = self.categroytag["%s" % category]
        elif category == "news_news_lianghui":
            category = "两会"
            categorytag = self.categroytag["%s" % category]
        elif category == "news_video_top":
            category = "视频"
            categorytag = self.categroytag["%s" % category]
        elif category == "news_video_main":
            category = "图片"
            categorytag = self.categroytag["%s" % category]
        else:
            SingleLogger().log.debug("not one of the 4 known channels")
            return
    except:
        SingleLogger().log.debug("no channel parameter")
        return
    crawltime = strjson['time']
    # Parse the payload; it may already be a dict
    data = strjson['data']
    try:
        data = json.loads(data)
    except:
        pass
    # The "两会" channel nests its newslist differently
    try:
        if category == "两会":
            newslist = data['idlist'][0]['newslist']
        else:
            newslist = data['newslist']
    except:
        return
    for y, x in enumerate(newslist):
        self.Analysis_ten(x, category, crawltime, y, categorytag)
def tryparse(self, raw):
    # Decode the raw bytes, then parse the JSON wrapper
    strjson = raw.decode("UTF-8", "ignore")
    strjson = json.loads(strjson)
    url = strjson['url']
    result = urllib.parse.urlparse(url)
    params = urllib.parse.parse_qs(result.query, True)
    if url.find('news-list-for-best-channel') > -1:
        category = "推荐"
        categorytag = self.categroytag["%s" % category]
    elif url.find('news-list-for-hot-channel') > -1:
        category = "要闻"
        categorytag = self.categroytag["%s" % category]
    elif url.find('news-list-for-channel') > -1:
        channel_id = params['channel_id'][0]
        if channel_id == "21044074964":
            category = "美图"
            categorytag = self.categroytag["%s" % category]
        elif channel_id == "21044074724":
            category = "视频"
            categorytag = self.categroytag["%s" % category]
        elif channel_id == "21044074756":
            category = "图片"
            categorytag = self.categroytag["%s" % category]
        else:
            SingleLogger().log.debug(url)
            return
    else:
        SingleLogger().log.debug(url)
        return
    crawltime = int(strjson['time'])
    # Parse the payload
    data = json.loads(strjson['data'])
    items = data['result']
    datalen = len(items)
    for y, x in enumerate(items):
        # "要闻"/"图片" carry a trailing placeholder, "视频"/"美图" a leading one
        if category == "要闻" or category == "图片":
            if datalen == y + 1:
                continue
        elif category == "视频" or category == "美图":
            if y == 0:
                continue
        self.Analysis_ydzx(x, category, crawltime, y, categorytag)
def get(url, referer=None):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36',
        'Accept': 'text/html;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'gzip',
        'Connection': 'close',
        'referer': referer
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        SingleLogger().log.debug("==========response.text===========>%s" % response.text)
        SingleLogger().log.debug("==========response.content===========>%s" % response.content)
        return response.content
    else:
        return ""
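# Usage sketch (hypothetical caller, hypothetical urls): fetch a detail page
# with its list page as the Referer, then hand the returned bytes to
# BeautifulSoup exactly as the parsers above do. An empty string signals a
# non-200 response.
html = Http.get('https://example.com/article/1.html',
                referer='https://example.com/list.html')
if html:
    soup = BeautifulSoup(html, "html.parser")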
def tryparse(self, raw):
    # Decode the raw bytes, then parse the JSON wrapper
    strjson = raw.decode("UTF-8", "ignore")
    strjson = json.loads(strjson)
    url = strjson['url']
    result = urllib.parse.urlparse(url)
    params = urllib.parse.parse_qs(result.query, True)
    channelId = params['channelId'][0]
    if url.find('v6/news.go') > -1:
        if channelId == "1":
            category = "要闻"
            categorytag = self.categroytag["%s" % category]
        elif channelId == "13557":
            category = "推荐"
            categorytag = self.categroytag["%s" % category]
        else:
            SingleLogger().log.debug(url)
            return
    elif url.find('v5/news.go') > -1:
        if channelId == "4313":
            category = "两会"
            categorytag = self.categroytag["%s" % category]
        else:
            SingleLogger().log.debug(url)
            return
    else:
        SingleLogger().log.debug(url)
        return
    crawltime = strjson['time']
    # Parse the payload
    data = json.loads(strjson['data'])
    articles = data['recommendArticles']
    for y, x in enumerate(articles):
        self.Analysis_shxw(x, category, crawltime, y, categorytag)
    # "要闻" additionally carries a pinned ("train") list
    if category == "要闻":
        articles = data['trainArticles']['trainList']
        for y, x in enumerate(articles):
            self.Analysis_shxw(x, category, crawltime, y, categorytag)
def tryparse(self, raw):
    # Decode the raw bytes, then parse the JSON wrapper
    strjson = raw.decode("UTF-8", "ignore")
    strjson = json.loads(strjson)
    url = strjson['url']
    # Only one column is captured for now, so it is hard-coded
    category = "推荐"
    categorytag = self.categroytag["%s" % category]
    crawltime = strjson['time']
    # Parse the payload
    data = strjson['data']
    try:
        data = json.loads(data)
        items = data['data']
        for y, x in enumerate(items):
            self.Analysis_bdxw(x, category, crawltime, y, categorytag)
    except:
        SingleLogger().log.debug("captured data is malformed")
def tryparse(self, raw):
    # Decode the raw bytes, then parse the JSON wrapper
    strjson = raw.decode("UTF-8", "ignore")
    strjson = json.loads(strjson)
    url = strjson['url']
    result = urllib.parse.urlparse(url)
    params = urllib.parse.parse_qs(result.query, True)
    category = ""  # column / channel name
    try:
        category = params['category'][0]
        if category == "news_hot":
            category = "热点"
            categorytag = self.categroytag["%s" % category]
        elif category == "hotsoon_video":
            category = "小视频"
            categorytag = self.categroytag["%s" % category]
        elif category == "video":
            category = "视频"
            categorytag = self.categroytag["%s" % category]
        elif category == "组图":
            category = "图片"
            categorytag = self.categroytag["%s" % category]
        elif category == "image_wonderful":
            category = "美图"
            categorytag = self.categroytag["%s" % category]
    except:
        if url.find('wenda/v1/native/feedbrow') > -1:
            category = "问答"
            categorytag = self.categroytag["%s" % category]
        else:
            category = "推荐"
            categorytag = self.categroytag["%s" % category]
            SingleLogger().log.debug("no category parameter")
    if category not in ("两会", "问答", "热点", "视频", "小视频", "推荐", "图片", "美图"):
        return
    crawltime = strjson['time']
    # Parse the payload
    data = json.loads(strjson['data'])
    items = data['data']
    for y, x in enumerate(items):
        self.Analysis_sntt(x, category, crawltime, y, categorytag)
def tryparse(self, raw):
    # Decode the raw bytes, then parse the JSON wrapper
    strjson = raw.decode("UTF-8", "ignore")
    strjson = json.loads(strjson)
    url = strjson['url']
    if url.find('api/feed_feedlist') > -1:
        category = "推荐"
        categorytag = self.categroytag["%s" % category]
    elif url.find('api/newchosenlist') > -1:
        category = "视频"
        categorytag = self.categroytag["%s" % category]
    elif url.find('api/newslist') > -1:
        category = "两会"
        categorytag = self.categroytag["%s" % category]
    elif url.find('api/medianewslist') > -1:
        category = "图片"
        categorytag = self.categroytag["%s" % category]
    else:
        SingleLogger().log.debug(url)
        return
    crawltime = strjson['time']
    # Parse the payload
    data = json.loads(strjson['data'])
    items = data['data']
    if category == "推荐":
        # "推荐" splits its entries into a pinned block and a normal block
        for y, x in enumerate(items['top']):
            self.Analysis_bdxw(x, category, crawltime, y, categorytag)
        for y, x in enumerate(items['news']):
            self.Analysis_bdxw(x, category, crawltime, y, categorytag)
    else:
        news = items['news']
        datalen = len(news)
        for y, x in enumerate(news):
            # "图片" carries a trailing placeholder, so stop before the last entry
            if category == "图片":
                if datalen == y + 1:
                    return
            self.Analysis_bdxw(x, category, crawltime, y, categorytag)
def tryparse(self, raw):
    # Decode the raw bytes, then parse the JSON wrapper
    strjson = raw.decode("UTF-8", "ignore")
    strjson = json.loads(strjson)
    url = strjson['url']
    if url.find('/aweme/v1/feed/') > -1:
        category = "推荐"
        categorytag = self.categroytag["%s" % category]
    elif url.find('/aweme/v1/nearby/feed/') > -1:
        category = "附近"
        categorytag = self.categroytag["%s" % category]
    else:
        SingleLogger().log.debug(url)
        return
    crawltime = strjson['time']
    # Parse the payload
    data = json.loads(strjson['data'])
    items = data['aweme_list']
    for y, x in enumerate(items):
        self.Analysis_bdxw(x, category, crawltime, y, categorytag)
def tryparse(self, raw):
    # Decode the raw bytes, then parse the JSON wrapper
    strjson = raw.decode("UTF-8", "ignore")
    strjson = json.loads(strjson)
    url = strjson['url']
    if url.find('rest/n/feed/hot') > -1:
        category = "发现"
        categorytag = self.categroytag["%s" % category]
    else:
        SingleLogger().log.debug(url)
        return
    crawltime = strjson['time']
    # Parse the payload; it may already be a dict
    data = strjson['data']
    try:
        data = json.loads(data)
    except:
        pass
    items = data['feeds']
    for y, x in enumerate(items):
        self.Analysis_ks(x, category, crawltime, y, categorytag)
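# Several tryparse variants above repeat the "json.loads unless the payload
# is already parsed" dance. A minimal shared helper (hypothetical, not in the
# original module) makes that intent explicit:
def ensure_json(data):
    """Return data parsed as JSON, or unchanged if it already is a dict/list."""
    if isinstance(data, (dict, list)):
        return data
    return json.loads(data)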
def Analysis_bdxw(self, data, category, crawltime, y, categorytag):
    seq = y + 1      # position in the feed
    title = ""       # headline
    articleid = ""   # article identifier
    restype = 1      # 1 article, 2 picture set, 3 video
    logo = ""        # list cover image(s)
    source = ""      # publisher
    abstract = ""    # summary
    tab = ""         # label
    gallary = ""     # detail images / video stills
    content = ""     # body
    audio = ''       # audio url(s)
    video = ''       # video url(s)
    try:
        corner_type = data['tips']
        if corner_type == "":
            restype = 1
            content = self.getHtmlBodyInnerText(data['share_url'])
        elif corner_type == "视频":
            restype = 3
            content = self.getHtmlVideos(data['share_url'])
            video = content
        elif corner_type == "广告":
            return
    except:
        SingleLogger().log.debug("not a video/picture item")
    title = data['title']
    abstract = data['introduction']
    url = data['share_url']
    source = data['source_name']
    articleid = data['id']
    publish_time = data['publish_time']
    img_url = data['cover']
    for i in img_url:
        if i != "":
            logo += i + ","
    gallary = self.getHtmlImages(url)
    # Trim a trailing comma from the gallery string, if any
    gallarylen = len(gallary)
    if gallarylen > 0:
        if gallary[gallarylen - 1] == ",":
            gallary = gallary[:-1]
    # For videos the playback address is already in hand, so skip the fetch
    if restype != 3:
        video += self.getHtmlVideos(url)
    crawltimestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(crawltime / 1000))
    publish_timestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(publish_time) / 1000))
    SingleLogger().log.debug(title)
    # Trim a trailing comma from the cover-image string, if any
    logolen = len(logo)
    if logolen > 0:
        if logo[logolen - 1] == ",":
            logo = logo[:-1]
    sdata = {
        "title": title,
        "description": abstract,
        "content": content,
        "source": source,
        "pubtimestr": publish_timestr,
        "pubtime": publish_time,
        "crawltimestr": crawltimestr,
        "crawltime": crawltime,
        "status": 0,
        "shorturl": url,
        "logo": logo,
        "labels": tab,
        "keyword": "",
        "seq": seq,
        "identity": str(articleid),
        "appname": self.appname,
        "app_tag": self.apptag,
        "category_tag": categorytag,
        "category": category,
        "restype": restype,
        "gallary": gallary,  # every image url found in the article
        "video": video,
        "audio": audio
    }
    self.db(sdata, articleid, title)
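# The trailing-comma trim above recurs in most parsers in this module. A
# one-line helper (hypothetical, not in the original source) would do the
# same job in each of them:
def trim_trailing_comma(joined):
    """Drop one trailing comma from a ','-joined url string."""
    return joined[:-1] if joined.endswith(",") else joined

# e.g. logo = trim_trailing_comma(logo); gallary = trim_trailing_comma(gallary)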
def Analysis_wyxw(self, data, category, crawltime, y, categorytag):
    title = ""       # headline
    articleid = ""   # article identifier
    restype = 1      # 1 article, 2 picture set, 3 video
    logo = ""        # list cover image(s)
    source = ""      # publisher
    abstract = ""    # summary
    content = ""     # body
    gallary = ""
    tab = ""         # label
    video = ''       # video url(s)
    audio = ''       # audio url(s)
    try:
        title = data['title']
    except:
        SingleLogger().log.debug("no title")
    publish_time = ""     # publish time (epoch)
    publish_timestr = ""  # publish time (formatted)
    if category == "视频":
        restype = 3
        abstract = data['description']
        logo = data['cover']
        source = data['topicName']
        articleid = data['vid']
    elif category == "图片":
        abstract = data['desc']
        publish_timestr = data['createdate']
        timeArray = time.strptime(publish_timestr, "%Y-%m-%d %H:%M:%S")
        publish_time = int(time.mktime(timeArray))
        title = data['setname']
        img_list = data['pics']
        for i in img_list:
            if i != "":
                logo += i + ","
        url = data['seturl']
        articleid = data['setid']
        restype = 2
    else:
        try:
            abstract = data['digest']
        except:
            SingleLogger().log.debug("no digest")
        try:
            logo = data['imgsrc']
        except:
            SingleLogger().log.debug("no imgsrc")
        try:
            source = data['source']
        except:
            SingleLogger().log.debug("no source")
        try:
            if category == "问吧":
                articleid = data['docid']
            else:
                articleid = data['id']
        except:
            SingleLogger().log.debug("no id")
        # An empty id means this is a picture-set item; use its photoset id
        if articleid == "":
            articleid = data['photosetID']
            restype = 2
        try:
            TAG = data['TAG']
            if TAG == "视频":
                restype = 3
        except:
            SingleLogger().log.debug("no TAG")
        try:
            img_list = data['imgnewextra']
            for z in img_list:
                if z['imgsrc'] != "":
                    logo += "," + z['imgsrc']
        except:
            SingleLogger().log.debug("one image at most")
        try:
            tab = data['interest']
            if tab == "S":
                tab = "置顶"
        except:
            SingleLogger().log.debug("no interest")
        if category == "热点":
            try:
                tab = data['recReason']
                if tab == "大家都在看":
                    tab = "热"
            except:
                SingleLogger().log.debug("no recReason")
    seq = y + 1  # position in the feed
    if publish_timestr == "":
        try:
            publish_timestr = data['ptime']
            timeArray = time.strptime(publish_timestr, "%Y-%m-%d %H:%M:%S")
            publish_time = int(time.mktime(timeArray))
        except:
            try:
                publish_time = data['recTime']
                publish_timestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time))
            except:
                SingleLogger().log.debug("neither recTime nor ptime")
    crawltimestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(crawltime / 1000))
    # Build the article-detail url
    news_detail_url = 'https://c.m.163.com/nc/article/' + str(articleid) + '/full.html'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    # restype 3: fetch the video address
    if restype == 3:
        url = 'https://c.m.163.com/nc/video/detail/' + str(articleid) + '.html'
        if category == "视频":
            content = data['mp4_url']
        elif category == "热点":
            news_detail = requests.get(url, headers=headers).json()
            content = news_detail['mp4_url']
        else:
            content = data['videoinfo']['mp4_url']
        video = content
    elif restype == 2:
        if category == "图片":
            strarr = url.split('/')
            first = strarr[4][-4:]
            second = articleid
        else:
            strarr = articleid.split('|')
            first = strarr[0][-4:]
            second = strarr[1]
        news_detail_url = 'https://c.m.163.com/photo/api/set/' + str(first) + '/' + str(second) + '.json'
        news_detail = requests.get(news_detail_url, headers=headers).json()
        url = news_detail['url']
        tdata = news_detail['photos']
        for t in tdata:
            if t['imgurl'] != "":
                gallary += t['imgurl'] + ","
            content += t['note'] + "<br/>"
    elif restype == 1:
        if category == "问吧":
            news_detail_url = 'https://wenba.m.163.com/wenda/mob/answer/detail.do?uuid=' + str(articleid)
            news_detail = requests.get(news_detail_url, headers=headers).json()['data']
            content = news_detail['answer']['content']
            # Collect the Q&A images
            image_list = news_detail['answer']['images']
            for i in image_list:
                if i['src'] != "":
                    gallary += i['src'] + ","
            url = "https://c.m.163.com/news/ans/" + articleid + ".html"
            # Comment api: https://comment.api.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/<replyid>/app/comments/newList
        else:
            try:
                news_detail = requests.get(news_detail_url, headers=headers)
                news_detail = news_detail.json()[str(articleid)]
            except:
                time.sleep(20)
                news_detail = requests.get(news_detail_url, headers=headers).json()[str(articleid)]
            # Collect the article images
            image_list = news_detail['img']
            for i in image_list:
                if i['src'] != "":
                    gallary += i['src'] + ","
            # Collect the article videos
            try:
                video_list = news_detail['video']
                for v in video_list:
                    if v['url_mp4'] != "":
                        video += v['url_mp4'] + ","
            except:
                SingleLogger().log.debug("no video")
            content = news_detail['body']
            # Append any special-topic paragraphs
            try:
                spinfo_list = news_detail['spinfo']
                for s in spinfo_list:
                    if s['spcontent'] != "":
                        content += s['spcontent']
            except:
                pass
            url = news_detail['shareLink']
    sdata = {
        "title": title,
        "description": abstract,
        "content": content,
        "source": source,
        "pubtimestr": publish_timestr,
        "pubtime": publish_time,
        "crawltimestr": crawltimestr,
        "crawltime": crawltime,
        "status": 0,
        "shorturl": url,
        "logo": logo,
        "labels": tab,
        "keyword": "",
        "seq": seq,
        "identity": str(articleid),
        "appname": self.appname,
        "app_tag": self.apptag,
        "category_tag": categorytag,
        "category": category,
        "restype": restype,
        "gallary": gallary,
        "video": video,
        "audio": audio
    }
    self.db(sdata, articleid, title)
def Analysis_dftt(self, data, category, crawltime, y, categorytag):
    seq = y + 1      # position in the feed
    title = ""       # headline
    articleid = ""   # article identifier
    restype = 1      # 1 article, 2 picture set, 3 video
    logo = ""        # list cover image(s)
    source = ""      # publisher
    abstract = ""    # summary
    tab = ""         # label
    gallary = ""
    content = ""     # body
    video = ''       # video url(s)
    audio = ''       # audio url(s)
    articleid = data['rowkey']
    title = data['topic']
    try:
        if data['hotnews'] == "1":
            tab = '热门'
    except:
        SingleLogger().log.debug('not hot news')
    try:
        if data['issptopic'] == "1":
            if tab == '':
                tab = '专题'
            else:
                tab += ',专题'
    except:
        SingleLogger().log.debug('not a special topic')
    try:
        for i in data['miniimg']:
            if i['src'] != "":
                logo += i['src'] + ","
    except:
        SingleLogger().log.debug('no cover image')
    try:
        url = data['shareurl']
    except:
        url = data['url']
    source = data['source']
    # Trim a trailing comma from the cover-image string, if any
    logolen = len(logo)
    if logolen > 0:
        if logo[logolen - 1] == ",":
            logo = logo[:-1]
    publish_time = data['ctrtime']
    crawltimestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(crawltime / 1000))
    publish_timestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(publish_time) / 1000))
    # Scrolling-video block
    additional02 = ""
    try:
        additional02 = data['additional02']
    except:
        SingleLogger().log.debug('not a scrolling video')
    if len(additional02) > 0:
        # note: distinct loop names keep the image loop from shadowing the item
        for item in additional02:
            for img in item['imgjs']:
                if img['src'] != "":
                    logo += img['src'] + ","
            source = item['source']
            for v in item['videojs']:
                if v['src'] != "":
                    video += v['src'] + ","
                    content += v['src'] + ","
            articleid = item['8413148441964056151']
            sdata = {
                "title": title,
                "description": abstract,
                "content": content,
                "source": source,
                "pubtimestr": publish_timestr,
                "pubtime": publish_time,
                "crawltimestr": crawltimestr,
                "crawltime": crawltime,
                "status": 0,
                "shorturl": url,
                "logo": logo,
                "labels": tab,
                "keyword": "",
                "seq": seq,
                "identity": str(articleid),
                "appname": self.appname,
                "app_tag": self.apptag,
                "category_tag": categorytag,
                "category": category,
                "restype": restype,
                "gallary": gallary,
                "video": video,
                "audio": audio
            }
            self.db(sdata, articleid, title)
    else:
        if data['videonews'] == "1":
            restype = 3
            content = data['video_link']
            video = data['video_link']
        # Plain article
        if restype == 1:
            if tab.find('专题') > -1:
                content = url
            else:
                try:
                    gallary = self.getHtmlImages(url)
                except:
                    SingleLogger().log.debug("no gallary")
                try:
                    content = self.getHtmlBodyInnerText(url)
                except:
                    SingleLogger().log.debug("no article body")
                try:
                    videos = self.getHtmlVideos(url)
                    if videos != '':
                        video += videos
                except:
                    SingleLogger().log.debug("no video in detail page")
        sdata = {
            "title": title,
            "description": abstract,
            "content": content,
            "source": source,
            "pubtimestr": publish_timestr,
            "pubtime": publish_time,
            "crawltimestr": crawltimestr,
            "crawltime": crawltime,
            "status": 0,
            "shorturl": url,
            "logo": logo,
            "labels": tab,
            "keyword": "",
            "seq": seq,
            "identity": str(articleid),
            "appname": self.appname,
            "app_tag": self.apptag,
            "category_tag": categorytag,
            "category": category,
            "restype": restype,
            "gallary": gallary,
            "video": video,
            "audio": audio
        }
        self.db(sdata, articleid, title)
def tryparse(self, raw):
    # Decode the raw bytes
    strjson = raw.decode("UTF-8", "ignore")
    wdid = ""
    # Parse the JSON wrapper
    strjson = json.loads(strjson)
    url = strjson['url']
    result = urllib.parse.urlparse(url)
    params = urllib.parse.parse_qs(result.query, True)
    if url.find('recommend/getSubDocPic') > -1:
        try:
            category = params['from'][0]
            if category == "toutiao":
                category = "头条"
                categorytag = self.categroytag["%s" % category]
        except:
            category = "热点"
            categorytag = self.categroytag["%s" % category]
    elif url.find('recommend/getChanListNews') > -1:
        category = "视频"
        categorytag = self.categroytag["%s" % category]
    elif url.find('recommend/getComRecNews') > -1:
        wdidstr = url.split('?')[0].split('/')
        wdid = wdidstr[5]
        category = "问吧"
        categorytag = self.categroytag["%s" % category]
    elif url.find('recommend/useraction') > -1:
        if url.find('recommend/useraction?info=') > -1:
            SingleLogger().log.debug(url)
            return
        category = "两会"
        categorytag = self.categroytag["%s" % category]
    elif url.find('photo/api') > -1:
        # photo/api/set is a picture-set detail request, not a list
        if url.find('photo/api/set') > -1:
            SingleLogger().log.debug(url)
            return
        category = "图片"
        categorytag = self.categroytag["%s" % category]
    else:
        return
    crawltime = strjson['time']
    # Parse the payload
    data = strjson['data']
    try:
        data = json.loads(data)
    except:
        SingleLogger().log.debug("invalid capture")
        return
    if category == "热点":
        items = data['推荐']
    elif category == "头条":
        items = data['T1348647909107']
    elif category == "视频":
        items = data['视频']
    elif category == "图片":
        items = data
    elif category == "问吧":
        items = data[wdid]
    for y, x in enumerate(items):
        self.Analysis_wyxw(x, category, crawltime, y, categorytag)
def Analysis_sina(self, data, category, crawltime, y, categorytag):
    video = ''            # video url(s)
    audio = ''            # audio url(s)
    title = ""            # headline
    abstract = ""         # summary
    articleid = ""        # article identifier
    tab = ""              # label
    source = ""           # publisher
    logo = ""             # list cover image(s)
    url = ""              # short article url
    actionType = ""       # display type (2 normal, 14 headline, 3 video, 1 ad, 6 picture)
    layoutStyle = ""      # layout style
    publish_time = ""     # publish time (epoch)
    publish_timestr = ""  # publish time (formatted)
    crawltimestr = ""     # capture time (formatted)
    restype = 1           # 1 article, 2 picture set, 3 video
    keywords = ""
    content = ""          # body
    gallary = ""          # picture-set image urls
    layoutStyle = data['layoutStyle']  # layout style; 20 marks a channel tile
    actionType = data['actionType']    # display type
    # Style 20 entries that carry a display type are channel tiles; skip them
    if layoutStyle == 20 and actionType != '':
        return
    # Style 36 entries other than type 23 are the rolling headline strip; skip them
    if layoutStyle == 36 and actionType != 23:
        return
    # Headline: intro as fallback, then prefer longTitle over title
    try:
        if data['intro'] and data['intro'] != "":
            title = data['intro']
        if data['longTitle'] and data['longTitle'] != "":
            title = data['longTitle']
        elif data['title'] and data['title'] != "":
            title = data['title']
        else:
            SingleLogger().log.debug('no title (1)')
    except:
        SingleLogger().log.debug('no title (2)')
    # Publish time
    try:
        publish_time = data['pubDate']
        if publish_time and publish_time != "":
            publish_timestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time))
    except:
        SingleLogger().log.debug("no publish time")
    # Capture time
    try:
        crawltimestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(crawltime / 1000))
    except:
        SingleLogger().log.debug("failed to format capture time")
    # Source
    try:
        source = data['source']
    except:
        SingleLogger().log.debug("no source")
    # Short url
    try:
        url = data['link']
    except:
        SingleLogger().log.debug("no short url")
    # Article id
    try:
        articleid = data['newsId']
    except:
        SingleLogger().log.debug("no article id")
    # Abstract
    try:
        abstract = data['intro']
    except:
        SingleLogger().log.debug('no abstract')
    # List cover image(s)
    try:
        images = data['pics']['list']
        for imgobj in images:
            logo += imgobj['pic'] + ","
    except:
        logo = data['pic']
    # Label
    try:
        tab = data['showTag']
    except:
        SingleLogger().log.debug('no label')
    # Per-type handling
    if actionType == 3:  # video
        SingleLogger().log.debug('video')
        restype = 3
        try:
            videoInfo = data['videoInfo']
            logo = videoInfo['pic']
            content = videoInfo['url']
            video = content
        except:
            SingleLogger().log.debug("failed to read video info")
    elif actionType == 6:  # picture set
        SingleLogger().log.debug('picture set')
        restype = 2
        try:
            logo = data['pic']
            images = data['pics']['list']
            for imgobj in images:
                gallary += imgobj['pic'] + ","
                content += imgobj['alt'] + "<br>"
        except:
            SingleLogger().log.debug('failed to read picture set')
    elif actionType == 1 and layoutStyle == 3:  # ad
        SingleLogger().log.debug('ad')
        content = url
    elif actionType == 14:  # tomorrow's headline
        SingleLogger().log.debug("tomorrow's headline")
        mrttList = data['mrttList']
        title = mrttList[0]['alt']
        logo = mrttList[0]['kpic']
        articleid = mrttList[0]['newsId']
    else:  # plain news
        SingleLogger().log.debug('plain news')
        if tab.find('专题') > -1:
            content = url
        else:
            if url != '':
                try:
                    gallary = self.getHtmlImages(url)
                except:
                    SingleLogger().log.debug("no gallary")
                try:
                    content = self.getHtmlBodyInnerText(url)
                except:
                    SingleLogger().log.debug("no article body")
                try:
                    videos = self.getHtmlVideos(url)
                    if videos != '':
                        video += videos
                except:
                    SingleLogger().log.debug("no video in detail page")
    sdata = {
        "title": title,
        "description": abstract,
        "content": content,
        # "source": source,
        "pubtimestr": publish_timestr,
        "pubtime": publish_time,
        "crawltimestr": crawltimestr,  # capture time
        "crawltime": crawltime,
        "status": 0,
        "shorturl": url,
        "logo": logo,
        "labels": tab,
        "keyword": "",
        "seq": y + 1,  # position in the feed
        "identity": str(articleid),
        "appname": self.appname,
        "app_tag": self.apptag,
        "category_tag": categorytag,
        "category": category,  # column
        "restype": restype,
        # "gallary": gallary,  # every image url found in the article
        "video": video,
        "audio": audio
    }
    SingleLogger().log.debug("=====sina======>%s" % sdata)
    self.db(sdata, articleid, title)
def add_ydzx_db(self, data, category, crawltime, y, categorytag):
    seq = y + 1      # position in the feed
    title = ""       # headline
    articleid = ""   # article identifier
    restype = 1      # 1 article, 2 picture set, 3 video
    logo = ""        # list cover image(s)
    source = ""      # publisher
    abstract = ""    # summary
    tab = ""         # label
    gallary = ""
    IsArtID = False  # True when the item is an ad
    content = ""     # body
    publish_timestr = ""
    publish_time = ""
    url = ""         # target url
    video = ''       # video url(s)
    audio = ''       # audio url(s)
    title = data['title']
    source = data['source']
    try:
        abstract = data['summary']
    except:
        SingleLogger().log.debug("no summary")
    try:
        articleid = data['docid']
    except:
        SingleLogger().log.debug("ad item")
        articleid = data['aid']
    if title == "":
        title = abstract
    try:
        image_list = data['image_urls']
        for i in image_list:
            if i != "":
                logo += i + ","
    except:
        SingleLogger().log.debug("no images")
    try:
        tab = data['card_label']['text']
    except:
        SingleLogger().log.debug("no label")
    try:
        url = data['url']
    except:
        SingleLogger().log.debug("no url")
    try:
        publish_timestr = data['date']
        timeArray = time.strptime(publish_timestr, "%Y-%m-%d %H:%M:%S")
        publish_time = int(time.mktime(timeArray))
    except:
        SingleLogger().log.debug("no time")
    try:
        content_type = data['content_type']
        if content_type == "video":
            restype = 3
            content = data['video_url']
            video = content
        elif content_type == "slides":
            restype = 2
            gallery_items = data['gallery_items']
            for g in gallery_items:
                if g['img'] != "":
                    gallary += g['img'] + ","
                if g['desc'] != "":
                    content += g['desc'] + "<br/>"
        elif content_type == "picture":
            logo = data['image']
    except:
        ctype = data['ctype']
        if ctype == "advertisement":
            IsArtID = True
            tab = data['tag']
            SingleLogger().log.debug("ad")
    crawltimestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(crawltime / 1000))
    # Build the article-detail url
    news_detail_url = 'https://a1.go2yd.com/Website/contents/content?docid=' + str(articleid)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    # Fetch detail unless the item is an ad or a video
    if IsArtID == False:
        if restype == 1:
            news_detail = requests.get(news_detail_url, headers=headers).json()['documents']
            if category == "美图":
                news_title = news_detail[0]['title']
                if news_title != "":
                    title = news_title
            abstract = news_detail[0]['summary']
            content = news_detail[0]['content']
    # Trim a trailing comma from the cover-image string, if any
    logolen = len(logo)
    if logolen > 0:
        if logo[logolen - 1] == ",":
            logo = logo[:-1]
    sdata = {
        "title": title,
        "description": abstract,
        "content": content,
        "source": source,
        "pubtimestr": publish_timestr,
        "pubtime": publish_time,
        "crawltimestr": crawltimestr,
        "crawltime": crawltime,
        "status": 0,
        "shorturl": url,
        "logo": logo,
        "labels": tab,
        "keyword": "",
        "seq": seq,
        "identity": str(articleid),
        "appname": self.appname,
        "app_tag": self.apptag,
        "category_tag": categorytag,
        "category": category,
        "restype": restype,
        "gallary": gallary,
        "video": video,
        "audio": audio
    }
    self.db(sdata, articleid, title)
def Analysis_shxw(self, data, category, crawltime, y, categorytag):
    video = ''       # video url(s)
    audio = ''       # audio url(s)
    seq = y + 1      # position in the feed
    title = ""       # headline
    articleid = ""   # article identifier
    restype = 1      # 1 article, 2 picture set, 3 video
    logo = ""        # list cover image(s)
    source = ""      # publisher
    abstract = ""    # summary
    tab = ""         # label
    gallary = ""
    IsArtID = False  # True when the item is an ad
    content = ""     # body
    publish_timestr = ""
    publish_time = ""
    url = ""         # target url
    try:
        articleid = data['newsId']
    except:
        SingleLogger().log.debug("ad item")
    try:
        title = data['title']
    except:
        SingleLogger().log.debug("no title")
    try:
        source = data['media']
    except:
        SingleLogger().log.debug("no source")
    try:
        abstract = data['description']
    except:
        SingleLogger().log.debug("no description")
    try:
        tab = data['recomReasons']
    except:
        SingleLogger().log.debug("no label")
    try:
        for i in data['pics']:
            if i != "":
                logo += i + ","
    except:
        SingleLogger().log.debug("no images")
    templateType = data['templateType']
    if templateType == 14:  # ad card
        IsArtID = True
        tab = "广告"
        articleid = data['data']['adid']
        title = data['data']['resource']['text']
        logo = data['data']['resource1']['file']
        source = data['data']['resource2']['text']
    elif templateType == 37:  # video card
        restype = 3
    try:
        publish_time = data['time']
        publish_timestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(publish_time) / 1000))
    except:
        SingleLogger().log.debug("no publish time")
    crawltimestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(crawltime / 1000))
    news_detail_url = 'https://zcache.k.sohu.com/api/news/cdn/v5/article.go/' + str(articleid) + '/0/0/0/3/1/18/40/5/1/1/1522743246021.json'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    if IsArtID == False:
        if restype == 3:
            vid = data['vid']
            if vid != 0:
                news_detail_url = 'https://s1.api.tv.itc.cn/v4/video/info/' + str(vid) + '.json?site=2&api_key=695fe827ffeb7d74260a813025970bd5'
                news_detail = requests.get(news_detail_url, headers=headers).json()
                content = news_detail['data']['download_url']
                url = news_detail['data']['url_html5']
                video = content
        else:
            news_detail = requests.get(news_detail_url, headers=headers).json()
            content = news_detail['content']
            gallary_list = news_detail['photos']
            for g in gallary_list:
                if g['pic'] != "":
                    gallary += g['pic'] + ","
            tvInfos = news_detail['tvInfos']
            for t in tvInfos:
                if t['tvUrl'] != "":
                    # A placeholder tvUrl means the real address must be fetched by video id
                    if t['tvUrl'] == "urlNor&prod=news&prod=h5,":
                        vid = t['vid']
                        news_detail_url = 'https://s1.api.tv.itc.cn/v4/video/info/' + str(vid) + '.json?site=2&api_key=695fe827ffeb7d74260a813025970bd5'
                        news_detail = requests.get(news_detail_url, headers=headers).json()
                        video += news_detail['data']['download_url'] + ","
                    else:
                        video += t['tvUrl'] + ","
    sdata = {
        "title": title,
        "description": abstract,
        "content": content,
        "source": source,
        "pubtimestr": publish_timestr,
        "pubtime": publish_time,
        "crawltimestr": crawltimestr,
        "crawltime": crawltime,
        "status": 0,
        "shorturl": url,
        "logo": logo,
        "labels": tab,
        "keyword": "",
        "seq": seq,
        "identity": str(articleid),
        "appname": self.appname,
        "app_tag": self.apptag,
        "category_tag": categorytag,
        "category": category,
        "restype": restype,
        "gallary": gallary,
        "video": video,
        "audio": audio
    }
    self.db(sdata, articleid, title)
def Analysis_bdxw(self, data, category, crawltime, y, categorytag):
    seq = y + 1      # position in the feed
    title = ""       # headline
    articleid = ""   # article identifier
    restype = 1      # 1 article, 2 picture set, 3 video
    logo = ""        # list cover image(s)
    source = ""      # publisher
    abstract = ""    # summary
    tab = ""         # label
    gallary = ""     # in-article images
    content = ""     # body
    video = ''       # video url(s)
    audio = ''       # audio url(s)
    url = ""         # short url
    if data['type'] == 'RECOMMENDED_MESSAGE':
        articleid = data['id']
        # This is a feed message; check for an attached video
        try:
            videofind = data['item']['video']
            isvideo = 1
        except:
            isvideo = 0
        if isvideo == 1:
            # Video item
            restype = 3
            logo = data['item']['video']['image']['picUrl']  # cover image
            # # short url
            # if data['item']['linkInfo']:
            #     url = data['item']['linkInfo']['originalLinkUrl']
            #     video = self.getVideo(url)
            #     if video and video != '':
            #         gallary += video + ","
        elif len(data['item']['pictures']) > 0:
            # Picture item
            restype = 2
            # Collect the cover images
            for picUrl in data['item']['pictures']:
                logo += picUrl['picUrl'] + ","
                gallary += picUrl['picUrl'] + ","
        else:
            # Plain text item
            restype = 1
        title = data['item']['topic']['content']
        source = data['item']['topic']['content']
        content = data['item']['content']
        if restype == 3:
            video = content
        # Times
        publish_timestr = data['item']['createdAt'][:-5].replace("T", " ")
        timeArray = time.strptime(publish_timestr, "%Y-%m-%d %H:%M:%S")
        publish_time = int(time.mktime(timeArray))  # string -> epoch
        crawltimestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(crawltime / 1000))
        # Short url
        if data['item']['linkInfo']:
            url = data['item']['linkInfo']['originalLinkUrl']
        # Trim trailing commas from logo and gallary, if any
        if len(logo) > 0:
            if logo[len(logo) - 1] == ",":
                logo = logo[:-1]
        if len(gallary) > 0:
            if gallary[len(gallary) - 1] == ",":
                gallary = gallary[:-1]
    else:
        return
    SingleLogger().log.debug(title)
    sdata = {
        "title": title,
        "description": abstract,
        "content": content,
        "source": source,
        "pubtimestr": publish_timestr,
        "pubtime": publish_time,
        "crawltimestr": crawltimestr,
        "crawltime": crawltime,
        "status": 0,
        "shorturl": url,
        "logo": logo,
        "labels": tab,
        "keyword": "",
        "seq": seq,
        "identity": str(articleid),
        "appname": self.appname,
        "app_tag": self.apptag,
        "category_tag": categorytag,
        "category": category,
        "restype": restype,
        "gallary": gallary,  # every image url found in the article
        "video": video,
        "audio": audio
    }
    self.db(sdata, articleid, title)
def Analysis_bdxw(self, data, category, crawltime, y, categorytag):
    seq = y + 1      # position in the feed
    title = ""       # headline
    articleid = ""   # article identifier
    restype = 1      # 1 article, 2 picture set, 3 video
    logo = ""        # list cover image(s)
    source = ""      # publisher
    abstract = ""    # summary
    tab = ""         # label
    gallary = ""
    content = ""     # body
    video = ''       # video url(s)
    audio = ''       # audio url(s)
    try:
        ctag = data['ctag']["name"]
        if ctag == "专题":
            return
        elif ctag == "置顶":
            tab = ctag
    except:
        SingleLogger().log.debug("no label")
    title = data['title']
    abstract = data['abs']
    url = data['url']
    source = data['site']
    articleid = data['nid']
    publish_time = data['sourcets']
    for i in data['imageurls']:
        if i['url'] != "":
            logo += i['url'] + ","
    try:
        corner_type = data['corner_type']
        if corner_type == "video":
            restype = 3
            content = data['video']['url']
            video = content
        elif corner_type == "image":
            restype = 2
    except:
        SingleLogger().log.debug("not a video/picture item")
    if restype != 3:
        for c in data['content']:
            if c['type'] == "image":
                gallary += c['data']['original']['url'] + ","
            elif c['type'] == "text":
                content += c['data'] + "<br/>"
    crawltimestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(crawltime / 1000))
    publish_timestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(publish_time) / 1000))
    SingleLogger().log.debug(title)
    # Trim a trailing comma from the cover-image string, if any
    logolen = len(logo)
    if logolen > 0:
        if logo[logolen - 1] == ",":
            logo = logo[:-1]
    sdata = {
        "title": title,
        "description": abstract,
        "content": content,
        "source": source,
        "pubtimestr": publish_timestr,
        "pubtime": publish_time,
        "crawltimestr": crawltimestr,
        "crawltime": crawltime,
        "status": 0,
        "shorturl": url,
        "logo": logo,
        "labels": tab,
        "keyword": "",
        "seq": seq,
        "identity": str(articleid),
        "appname": self.appname,
        "app_tag": self.apptag,
        "category_tag": categorytag,
        "category": category,
        "restype": restype,
        "gallary": gallary,
        "video": video,
        "audio": audio
    }
    self.db(sdata, articleid, title)
def Analysis_ten(self, data, category, crawltime, y, categorytag):
    video = ''  # video url(s)
    audio = ''  # audio url(s)
    # Headline
    title = ""
    try:
        title = data['title']
    except:
        SingleLogger().log.debug('no title')
    # Summary
    abstract = ""
    try:
        abstract = data['abstract']
    except:
        SingleLogger().log.debug("no abstract")
    # Article id; bail out when empty
    articleid = ""
    try:
        articleid = data['id']
        if articleid == "":
            return
    except:
        SingleLogger().log.debug("no article id")
    menulable = ""  # channel tag used by the detail api
    try:
        menulable = data['uinname']
    except:
        SingleLogger().log.debug("no channel tag")
    logo = ""    # list cover image(s)
    source = ""  # publisher
    url = ""     # article url
    try:
        source = data['source']
    except:
        SingleLogger().log.debug("no source")
    try:
        # Share url, with two fallbacks
        url = data['url']
        if not (url) or url == "":
            url = data['short_url']
        if not (url) or url == "":
            url = data['surl']
    except:
        SingleLogger().log.debug("no article url")
    publish_time = ""     # publish time (epoch)
    publish_timestr = ""  # publish time (formatted)
    try:
        publish_time = data['timestamp']
        if publish_time and publish_time != "":
            publish_timestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time))
    except:
        SingleLogger().log.debug("no publish time")
    # Capture time
    crawltimestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(crawltime / 1000))
    restype = 1   # 1 article, 2 picture set, 3 video
    seq = y + 1   # position in the feed
    keywords = ""
    tab = ""      # label
    try:
        zdTab = data['labelList'][0]['word']
        if tab == "":
            tab = zdTab
        else:
            tab += "," + zdTab
    except:
        SingleLogger().log.debug("no label")
    try:
        zdTab = data['up_labelList'][0]['word']
        if tab != '':
            tab += ',' + zdTab
        else:
            tab = zdTab
    except:
        SingleLogger().log.debug("no special label")
    # Display type (e.g. 528 hot picks, 88 Q&A)
    articletype = ""
    try:
        articletype = data['articletype']
    except:
        SingleLogger().log.debug("no articletype")
        return
    content = ""
    gallary = ""  # picture-set image urls
    # List thumbnail layout (1 none, 0 one small, 3 one large, 2 three small)
    picShowType = ""
    try:
        picShowType = data['picShowType']
    except:
        SingleLogger().log.debug("no picShowType")
    # Thumbnails (except for the video board)
    if picShowType == 1:
        logo = ""  # the list shows no image
    else:
        # Prefer thumbnails_qqnews, then its photo variant, then thumbnails
        try:
            image_list = data['thumbnails_qqnews']
            if not (image_list):
                image_list = data['thumbnails_qqnews_photo']
            if not (image_list):
                image_list = data['thumbnails']
            logo = ""
            for i in image_list:
                if i != "":
                    logo += i + ","
        except:
            SingleLogger().log.debug("no list image; item may be imageless or a video")
    if articletype == "528" or articletype == "525":  # hot picks
        if tab == "":
            tab = "热点精选"
        else:
            tab += ",热点精选"
        try:
            # Take the first entry of the nested hot list
            childList = data['newsModule']['newslist'][0]
            title = childList['title']
            source = childList['source']
            abstract = childList['abstract']
            logo = childList['thumbnails_qqnews'][0]
            if not (logo):
                logo = childList['thumbnails_qqnews_photo'][0]
            if not (logo):
                logo = childList['thumbnails'][0]
            try:
                url = data['url']
                if not (url) or url == "":
                    url = data['short_url']
                if not (url) or url == "":
                    url = data['surl']
            except:
                SingleLogger().log.debug("no article url")
            content = url
        except:
            SingleLogger().log.debug("hot-picks item has no content")
    elif articletype == "4" or articletype == "101":  # video news
        restype = 3
        try:
            videoData = data["video_channel"]["video"]
            logo = videoData["img"]
            content = videoData["playurl"]
            video = content
        except:
            SingleLogger().log.debug('no video')
    elif articletype == "533":  # live stream
        restype = 3
        if tab == "":
            tab = "直播"
        else:
            tab += ",直播"
        liveVideo = data["newsModule"]["newslist"][0]
        title = liveVideo['title']
        source = liveVideo['source']
        abstract = liveVideo['abstract']
        logo = liveVideo['thumbnails_qqnews'][0]
        if not (logo):
            logo = liveVideo['thumbnails_qqnews_photo'][0]
        if not (logo):
            logo = liveVideo['thumbnails'][0]
        try:
            publish_time = liveVideo['timestamp']
            if publish_time and publish_time != "":
                publish_timestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time))
        except:
            SingleLogger().log.debug("no publish time")
        try:
            url = liveVideo['url']
            if not (url) or url == "":
                url = liveVideo['short_url']
            if not (url) or url == "":
                url = liveVideo['surl']
            try:
                content = liveVideo['video_channel']['video']['playurl']
            except:
                content = url
            video = content
        except:
            SingleLogger().log.debug("no share url")
    elif articletype == "526":  # tag list, not news
        return
    elif articletype == "0" or articletype == "12" or articletype == "1":  # plain / picture news
        news_detail_url = 'http://r.inews.qq.com/getSimpleNews/1.3.1_qqnews_5.5.90/' + str(menulable) + '/' + str(articleid)
        news_detail = rq.get(news_detail_url).json()
        if articletype == "0" or articletype == "12":
            content = news_detail['content']['text']
            attribute = news_detail['attribute']
            for a in attribute:
                try:
                    # Skip image urls already collected
                    if gallary.find(attribute[a]["url"]) > -1:
                        continue
                    else:
                        gallary += attribute[a]["url"] + ","
                except:
                    try:
                        video += attribute[a]["playurl"] + ","
                    except:
                        try:
                            audio += attribute[a]["murl"] + ","
                        except:
                            SingleLogger().log.debug(json.dumps(attribute))
        elif articletype == "1":  # picture news
            restype = 2
            attribute = news_detail['attribute']
            for a in attribute:
                try:
                    # Skip image urls already collected
                    if gallary.find(attribute[a]["url"]) > -1:
                        continue
                    else:
                        gallary += attribute[a]["url"] + ","
                        content += attribute[a]['desc'] + "<br/>"
                except:
                    SingleLogger().log.debug(json.dumps(attribute))
    elif articletype == "100":  # special topic
        content = url
    sdata = {
        "title": title,
        "description": abstract,
        "content": content,
        "source": source,
        "pubtimestr": publish_timestr,
        "pubtime": publish_time,
        "crawltimestr": crawltimestr,  # capture time
        "crawltime": crawltime,
        "status": 0,
        "shorturl": url,
        "logo": logo,
        "labels": tab,
        "keyword": "",
        "seq": seq,
        "identity": str(articleid),
        "appname": self.appname,
        "app_tag": self.apptag,
        "category_tag": categorytag,
        "category": category,  # column
        "restype": restype,    # resource type
        "gallary": gallary,
        "video": video,
        "audio": audio
    }
    self.db(sdata, articleid, title)
def Analysis_bdxw(self, data, category, crawltime, y, categorytag):
    seq = y + 1      # position in the feed
    title = ""       # headline
    articleid = ""   # article identifier
    restype = 1      # 1 article, 2 picture set, 3 video
    logo = ""        # list cover image(s)
    source = ""      # publisher
    abstract = ""    # summary
    tab = ""         # label
    gallary = ""     # detail images / video stills
    content = ""     # body
    audio = ''       # audio url(s)
    url = data['url']
    # Everything seen on this board so far is a plain article
    restype = 1
    content = self.getHtmlBodyInnerText(url)
    title = data['topic']
    source = data['source']
    articleid = data['rowkey']
    publish_time = data['date']
    for i in data['miniimg']:
        if i['src'] != "":
            logo += i['src'] + ","
    gallary = self.getHtmlImages(url)
    video = self.getHtmlVideos(url)
    crawltimestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(crawltime / 1000))
    publish_timestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(publish_time) / 1000))
    SingleLogger().log.debug(title)
    # Trim a trailing comma from the cover-image string, if any
    logolen = len(logo)
    if logolen > 0:
        if logo[logolen - 1] == ",":
            logo = logo[:-1]
    sdata = {
        "title": title,
        "description": abstract,
        "content": content,
        "source": source,
        "pubtimestr": publish_timestr,
        "pubtime": publish_time,
        "crawltimestr": crawltimestr,
        "crawltime": crawltime,
        "status": 0,
        "shorturl": url,
        "logo": logo,
        "labels": tab,
        "keyword": "",
        "seq": seq,
        "identity": str(articleid),
        "appname": self.appname,
        "app_tag": self.apptag,
        "category_tag": categorytag,
        "category": category,
        "restype": restype,
        "gallary": gallary,  # every image url found in the article
        "video": video,
        "audio": audio
    }
    self.db(sdata, articleid, title)
def Analysis_sntt(self, x, category, crawltime, y, categorytag):
    data = x['content']
    video = ''  # video url(s)
    audio = ''  # audio url(s)
    data = json.loads(data)
    title = ""       # headline
    articleid = ""   # article identifier
    restype = 1      # 1 article, 2 picture set, 3 video
    logo = ""        # list cover image(s)
    source = ""      # publisher
    abstract = ""    # summary
    try:
        abstract = data['abstract']
    except:
        SingleLogger().log.debug("no abstract")
    seq = y + 1       # position in the feed
    keywords = ""
    gallary = ""      # picture-set image urls
    IsV = False       # True for a "big V" user-thread item
    IsArtID = False   # True for an ad without an identifier
    url = ""          # article url
    content = ""      # body
    publish_time = ""     # publish time (epoch)
    publish_timestr = ""  # publish time (formatted)
    hot = 0           # 1 when the item is hot
    tab = ""          # label
    try:
        tab = data['label']
    except:
        tab = ""
        SingleLogger().log.debug("not a pinned item")
    try:
        url = data['display_url']  # share url
    except:
        try:
            url = data['share_url']
        except:
            try:
                url = data['url']
            except:
                SingleLogger().log.debug("no article url")
    try:
        publish_time = data['publish_time']
    except:
        SingleLogger().log.debug("no publish time")
    if category == "美图":
        try:
            title = data['content']
        except:
            SingleLogger().log.debug("image-only item, no text")
        articleid = data['group_id']
        logo = data['large_image']['url']
        publish_time = data['create_time']
    elif category == "小视频":
        xdata = data['raw_data']
        title = xdata['title']
        articleid = xdata['group_id']
        for w in xdata['large_image_list']:
            if w['url'] != "":
                logo += w['url'] + ","
        url = xdata['share']['share_url']
        publish_time = xdata['create_time']
        restype = 3
    elif category == "问答":
        seq = y
        try:
            qdata = json.loads(data['question'])
            title = qdata['title']
            articleid = qdata['qid']
            publish_time = qdata['create_time']
            for w in qdata['content']['large_image_list']:
                if w['url'] != "":
                    logo += w['url'] + ","
        except:
            return
    else:
        try:
            title = data['title']
        except:
            SingleLogger().log.debug("no title")
        try:
            articleid = data['group_id']
        except:
            try:
                articleid = data['thread_id']
                IsV = True
                try:
                    # First three images become the cover; all go to the gallery
                    large_image_list = data['large_image_list']
                    for i, j in enumerate(large_image_list):
                        if j['url'] != "":
                            if i < 3:
                                logo += j['url'] + ","
                            gallary += j['url'] + ","
                except:
                    SingleLogger().log.debug("user thread has no images")
            except:
                # An ad without an identifier gets a generated one
                if tab == "广告":
                    articleid = uuid.uuid1()
                    IsArtID = True
                SingleLogger().log.debug("no identifier")
        # has_video marks a video item
        if data['has_video'] == True:
            restype = 3
        else:
            try:
                keywords = data['keywords']
            except:
                SingleLogger().log.debug("no keywords")
            try:
                logo = data['middle_image']['url']
            except:
                SingleLogger().log.debug("no cover image")
    # An empty identifier means the item cannot be stored
    if articleid == "":
        return
    try:
        source = data['source']
    except:
        SingleLogger().log.debug("no source")
    # Picture-set flag
    try:
        if data['gallary_flag'] == 1:
            restype = 2
    except:
        SingleLogger().log.debug("not picture news")
    try:
        hot = data['hot']  # 0 normal, 1 hot
    except:
        SingleLogger().log.debug("not hot")
    if hot == 1:
        if tab == "":
            tab = "热"
        else:
            tab = tab + ",热"
    # Plain articles may carry several list images
    if restype == 1:
        try:
            image_list = data['image_list']
            logo = ""
            for i in image_list:
                if i['url'] != "":
                    logo += i['url'] + ","
        except:
            SingleLogger().log.debug("one image at most")
    if publish_time != "":
        publish_timestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time))
    crawltimestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(crawltime / 1000))
    # Fetch the detail unless the item is an ad without an identifier
    if IsArtID == False:
        if category == "问答":
            news_detail_url = 'http://is.snssdk.com/wenda/v2/question/brow/?device_id=48679316565'
            postData = {'qid': str(articleid), 'count': 30}
            news_detail = requests.post(news_detail_url, data=postData).json()
            qdata = news_detail['question']
            content = qdata['content']['text']
            url = qdata['share_data']['share_url']  # share url
            adata = news_detail['data']
            gallary = ""
            for a in adata:
                content += "<br/>" + a['answer']['content_abstract']['text']
                image_list = a['answer']['content_abstract']['large_image_list']
                for w in image_list:
                    if w['url'] != "":
                        gallary += w['url'] + ","
            articleid = uuid.uuid1()
        else:
            # Build the detail url
            news_detail_url = 'http://a3.bytecdn.cn/article/content/15/2/' + str(articleid) + '/' + str(articleid) + '/1/'
            if IsV:
                news_detail_url = 'http://lf.snssdk.com/ugc/thread/detail/v2/content/?thread_id=' + str(articleid)
                news_detail = requests.get(news_detail_url).json()
            else:
                news_detail = requests.get(news_detail_url).json()['data']
            try:
                content = news_detail['content']
                if restype == 2:
                    # Picture news: collect the gallery urls
                    gallery = news_detail['gallery']
                    for z in gallery:
                        gallary += z['sub_image']['url'] + ","
                elif restype == 3:
                    video = content
            except:
                SingleLogger().log.debug("no content / gallery read failed")
    sdata = {
        "title": title,
        "description": abstract,
        "content": content,
        "source": source,
        "pubtimestr": publish_timestr,
        "pubtime": publish_time,
        "crawltimestr": crawltimestr,
        "crawltime": crawltime,
        "status": 0,
        "shorturl": url,
        "logo": logo,
        "labels": tab,
        "keyword": keywords,
        "seq": seq,
        "identity": str(articleid),
        "appname": self.appname,
        "app_tag": self.apptag,
        "category_tag": categorytag,
        "category": category,
        "restype": restype,
        "gallary": gallary,
        "video": video,
        "audio": audio
    }
    self.db(sdata, articleid, title)
def Analysis_fenghuang(self, data, category, crawltime, y, categorytag, label):
    title = ""            # headline
    abstract = ""         # summary
    articleid = ""        # article identifier
    tab = label           # label
    source = ""           # publisher
    logo = ""             # list cover image(s)
    url = ""              # short article url
    articletype = ""      # display type (phvideo video, doc article)
    publish_time = ""     # publish time (epoch)
    publish_timestr = ""  # publish time (formatted)
    crawltimestr = ""     # capture time
    restype = 1           # 1 article, 2 picture set, 3 video
    seq = y + 1           # position in the feed
    keywords = ""
    content = ""          # body
    gallary = ""          # picture-set image urls
    detailJk = ""         # detail api url
    style = ""            # hot marker
    video = ''            # video url(s)
    audio = ''            # audio url(s)
    # Skip recommended-follow carousels outright
    try:
        articletype = data['type']
        if articletype == 'marquee2':
            return
    except:
        SingleLogger().log.debug("no articletype")
        return
    # Headline
    try:
        title = data['title']
    except:
        SingleLogger().log.debug('no title')
    # Labels
    try:
        style = data['style']['recomReason']['reasonName']
        if tab == "":
            tab = style
        else:
            tab += "," + style
    except:
        SingleLogger().log.debug('item has no label')
    try:
        style = data['style']['attribute']
        if tab == "":
            tab = style
        else:
            tab += "," + style
    except:
        SingleLogger().log.debug('item has no label')
    # Publish time; fall back to the capture time when absent
    try:
        publish_timestr = data['updateTime']
        timeArray = time.strptime(publish_timestr, "%Y-%m-%d %H:%M:%S")
        publish_time = int(time.mktime(timeArray))  # string -> epoch
    except:
        SingleLogger().log.debug("no publish time")
        publish_timestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(crawltime / 1000))
        publish_time = crawltime
    # Capture time
    try:
        crawltimestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(crawltime / 1000))
    except:
        SingleLogger().log.debug("failed to format capture time")
    # Source
    try:
        source = data['source']
    except:
        SingleLogger().log.debug("no source, or a video/ad item")
    # No link field means nothing to fetch; bail out
    try:
        linkInfo = data['link']
    except:
        return
    # Short url
    try:
        url = linkInfo['weburl']
    except:
        SingleLogger().log.debug("no short url")
    # Article id
    try:
        articleid = data['documentId']
    except:
        SingleLogger().log.debug("no article id, or an ad item")
    # List cover: style.images first, thumbnail as fallback
    try:
        images = data['style']['images']
        if images and len(images) > 0:
            for imgobj in images:
                logo += imgobj + ","
        else:
            logo = data['thumbnail']
            if logo == "":
                SingleLogger().log.debug('no list image')
    except:
        try:
            logo = data['thumbnail']
        except:
            SingleLogger().log.debug('no list image')
    # Per-type handling
    # Video, or the short-video board
    if articletype == 'phvideo' or articletype == "videoshortimg":
        restype = 3
        try:
            source = data['phvideo']['channelName']  # publisher
        except:
            SingleLogger().log.debug("video has no source")
        try:
            guid = data['id']  # video api parameter
            articleid = guid
            # Without an mp4 field, fall back to the detail api
            try:
                content = linkInfo['mp4']
            except:
                detailJk = "http://api.3g.ifeng.com/api_phoenixtv_details?guid=" + guid
                postData = {}
                res = self.httpPost(detailJk, postData)
                content = res['singleVideoInfo'][0]['videoURLMid']
            video = content
        except:
            SingleLogger().log.debug("failed to read video detail")
    # Picture board / picture news
    elif articletype == "photo" or articletype == "slide":
        restype = 2
        try:
            detailJk = linkInfo['url']  # detail api url
            postData = {}
            res = self.httpPost(detailJk, postData)
            # Use the slides field when present; otherwise read it like plain news
            try:
                slides = res['body']['slides']
                if len(slides) > 0:
                    for sldobj in slides:
                        curDesc = sldobj['description']
                        curImg = sldobj['image']
                        if curDesc != "":
                            content += curDesc + "<br/>"
                        if curImg != "":
                            gallary += curImg + ","
            except:
                try:
                    content = res['body']['text']
                except:
                    SingleLogger().log.debug("no text")
                try:
                    gallaryList = res['body']['img']
                    if len(gallaryList) > 0:
                        for gaobj in gallaryList:
                            gallary += gaobj['url'] + ","
                except:
                    SingleLogger().log.debug("no images in detail")
                try:
                    videos = res['body']['videos']
                    for vidobj in videos:
                        video += vidobj['video']['Normal']['src'] + ","
                except:
                    SingleLogger().log.debug("no videos in detail")
        except:
            SingleLogger().log.debug("failed to read picture detail")
    # Ad
    elif articletype == "advert":
        try:
            articleid = data['pid']
        except:
            SingleLogger().log.debug("ad has no article id")
        content = url
    # Plain news
    elif articletype == "doc":
        try:
            detailJk = linkInfo['url']  # detail api url
            postData = {}
            res = self.httpPost(detailJk, postData)
            try:
                content = res['body']['text']
            except:
                SingleLogger().log.debug("no text")
            try:
                gallaryList = res['body']['img']
                if len(gallaryList) > 0:
                    for gaobj in gallaryList:
                        gallary += gaobj['url'] + ","
            except:
                SingleLogger().log.debug("no images in detail")
            try:
                videos = res['body']['videos']
                for vidobj in videos:
                    video += vidobj['video']['Normal']['src'] + ","
            except:
                SingleLogger().log.debug("no videos in detail")
        except:
            SingleLogger().log.debug("failed to read article detail")
    # Pinned topic
    elif articletype == "topic2":
        content = url
    sdata = {
        "title": title,
        "description": abstract,
        "content": content,
        # "source": source,
        "pubtimestr": publish_timestr,
        "pubtime": publish_time,
        "crawltimestr": crawltimestr,  # capture time
        "crawltime": crawltime,
        "status": 0,
        "shorturl": url,
        "logo": logo,
        "labels": tab,
        "keyword": "",
        "seq": seq,
        "identity": str(articleid),
        "appname": self.appname,
        "app_tag": self.apptag,
        "category_tag": categorytag,
        "category": category,  # column
        "restype": restype,    # resource type
        "gallary": gallary,    # every image url found in the article
        "video": video,
        "audio": audio
    }
    self.db(sdata, articleid, title)