import json
import re
import time

import requests
from lxml import etree

from uitls import tool


class sougou_1():
    tools = tool()
    headers = tools.headers()
    dict_ = tools.dict_()

    def response(self, keyword):
        # Search Sohu for the keyword and return the matching media accounts.
        url = 'https://search.sohu.com/search/meta'
        data = self.tools.data(keyword)
        response = requests.get(url=url, params=data)
        datas = json.loads(response.text)['data']['media']
        return datas

    def parse(self, data, keyword):
        # Parse one account's profile and store it in MySQL; only accounts with
        # totalPv >= 100000 and newsCount >= 20 are kept.
        try:
            totalPv = data['scoreMap']['totalPv']
            newsCount = data['scoreMap']['newsCount']
            if totalPv >= 100000 and newsCount >= 20:
                user_id = data['id']  # account id
                author = data['userName']  # user nickname
                home_url = data['weiboUrl']  # user homepage URL
                avatar = data['avatorUrl']  # user avatar URL
                if avatar.split('//')[0] == "http:":
                    avatar_url = avatar
                else:
                    avatar_url = 'http:' + avatar
                brief = data['description']  # author bio
                # fans_num = re.findall('fans_num.*?:\"(.*?)\",', html)[0]  # fan count
                create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())  # record creation time
                source_name = '搜狐号'  # source name
                biz = "souhu" + str(data['id'])
                # follow_num = re.findall('follow_num.*?:\"(.*?)\",', html)[0]  # following count
                tags = keyword
                sql1 = 'select * from spider_user_info where author="%s"' % author
                cursor = self.tools.sqll(sql1)
                result = cursor.fetchall()
                if not result:
                    sql2 = 'insert into spider_user_info(id,author,biz,avatar_url,home_url, source_name, brief, create_time,tags) values("%s","%s","%s","%s","%s","%s","%s","%s","%s")' % (
                        user_id, author, biz, avatar_url, home_url, source_name, brief, create_time, tags)
                    self.tools.sqll(sql2)
                    print(user_id, biz, author, home_url, avatar_url, source_name, brief, create_time, tags)
                else:
                    pass
            else:
                pass
        except Exception as e:
            print(e)
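
# Usage sketch (not part of the original source): one plausible way to drive
# sougou_1, assuming tool().list_() yields the search keywords as it does in
# baijia_1's __main__ block below; run_sougou_1 is a hypothetical helper name.
def run_sougou_1():
    spider = sougou_1()
    for keyword in tool().list_():
        for media in spider.response(keyword):
            spider.parse(media, keyword)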
class qiehao_1():
    tools = tool()
    headers = tools.headers()
    dict_ = tools.dict_()

    def response(self, keyword):
        # Search 企鹅号 accounts for the keyword and return the result list.
        url = 'https://r.inews.qq.com/verticalSearch?chlid=_qqnews_custom_search_qiehao&search_from=click&uid=44ce3651532c37f9&omgid=e76c5bfab95b6547ffab46fb08c39bd795f60010213414&trueVersion=5.8.12&qimei=44ce3651532c37f9&appver=25_android_5.8.12&devid=44ce3651532c37f9&Cookie=lskey%3D;skey%3D;uin%3D;%20luin%3D;logintype%3D0;%20main_login%3D;%20&qn-sig=f50e2c8c758767a6bc87be6605573722&qn-rid=219c9f88-e74a-4670-bb7d-3497cec83c8a'
        data = self.tools.data(keyword)
        response = requests.post(url, data=data, headers=self.headers, timeout=15)
        html = json.loads(response.text)
        datas = html['secList']
        return datas

    def fan_num(self, url):
        # Fetch the account's subscriber (fan) count; fall back to 0 on failure.
        try:
            fans_num = json.loads(
                requests.get(url).text)['channelInfo']['subCount']
        except:
            fans_num = 0
        return fans_num

    def parse(self, data, keyword):
        # Parse one account's profile and store it in MySQL; only accounts with
        # at least one fan that are not already in the table are inserted.
        try:
            user_id = data['chlid']  # account id
            author = data['chlname']  # user nickname
            biz = "qiehao" + str(data['chlid'])
            home_url = 'https://r.inews.qq.com/getSubItem?chlid={}'.format(
                data['chlid'])  # user homepage URL
            avatar_url = data['imgurl']  # user avatar URL
            brief = data['abstract']  # author bio
            create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())  # record creation time
            source_name = '企鹅号'  # source name
            fans_num = int(self.fan_num(home_url))  # fan count
            tags = keyword
            sql1 = 'select * from spider_user_info where author="%s"' % author
            cursor = self.tools.sqll(sql1)
            result = cursor.fetchall()
            if not result and fans_num >= 1:
                sql2 = 'insert into spider_user_info(id,author,biz,home_url,avatar_url,brief,create_time,source_name,fans_num,tags) values("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s")' % (
                    user_id, author, biz, home_url, avatar_url, brief, create_time, source_name, fans_num, tags)
                self.tools.sqll(sql2)
                print(user_id, author, biz, home_url, avatar_url, brief, create_time, source_name, fans_num, tags)
            else:
                pass
        except Exception as e:
            print(e)
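
# Usage sketch (assumption, mirrors the sougou_1 driver above): search each
# keyword on 企鹅号 and store the qualifying accounts; run_qiehao_1 is a
# hypothetical helper name.
def run_qiehao_1():
    spider = qiehao_1()
    for keyword in tool().list_():
        for item in spider.response(keyword):
            spider.parse(item, keyword)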
class baijia_2():
    tools = tool()
    headers = tools.headers()
    dict_ = tools.dict_()
    browser = tools.browser()

    def checksql(self):
        # Use the status flag to pick a 百家号 author whose follow list has not
        # been crawled yet, mark it as taken, and return the author's uk.
        sql1 = 'select biz from spider_user_info where status = 0 and source_name = "百家号" limit 0,1 '
        cursor = self.tools.sqll(sql1)
        result = cursor.fetchall()
        biz = result[0]['biz']
        uk = biz.split('/')[1]
        sql2 = 'update spider_user_info set status=1 where biz="%s"' % biz
        self.tools.sqll(sql2)
        return uk

    def response(self, id):
        # Fetch the page source of this author's profile page.
        for i in self.dict_:
            self.browser.add_cookie({
                'name': i,
                'value': self.dict_[i],
            })
        if id.isdigit():
            url = 'https://author.baidu.com/profile?context={%22from%22:%22dusite_sresults%22,%22app_id%22:%22' + id + '%22}&cmdType=&pagelets[]=root&reqID=0&ispeed=1'
        else:
            url = 'https://author.baidu.com/profile?context={%22from%22:%22dusite_sresults%22,%22uk%22:%22' + id + '%22}&cmdType=&pagelets[]=root&reqID=0&ispeed=1'
        try:
            self.browser.get(url)
        except:
            pass
        html = self.browser.page_source
        return html

    def fans(self, uk):
        # Fetch the list of accounts this author follows and return their ids.
        ids = []
        try:
            url = 'https://mbd.baidu.com/webpage?action=personaljumpsublist&type=subscribe&uk={}'.format(uk)
            response = requests.get(url=url)
            response.encoding = 'utf-8'
            html = response.text
            datas = json.loads(html)
            follows = datas['data']['follow_list']['modify']
            for follow in follows:
                id = follow['third_id']
                ids.append(id)
            return ids
        except Exception as e:
            print(e)
            return ids

    def parse(self, id):
        # Parse the profile of a followed account and insert it into the
        # database if it is not already there.
        html = self.response(id)
        try:
            html = html.replace("\\", "")
            author = re.findall('display_name\":\"(.*?)\"', html)[0].encode(
                "gb18030", "ignore").decode("utf8", "ignore").replace("\\", "")  # user nickname
            third_id = re.findall('third_id\":\"(.*?)",', html)[0]
            home_url = 'https://author.baidu.com/home/' + third_id  # user homepage URL
            avatar_url = re.findall('avatar_raw\":\"(.*?)",', html)[0]  # user avatar URL
            brief = str(re.findall('sign\":\"(.*?)\",', html)[0]).encode(
                "gb18030", "ignore").decode("utf8", "ignore").replace("\\", "")  # author bio
            fans_num = re.findall('fans_num.*?:\"(.*?)\",', html)[0]  # fan count
            create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())  # record creation time
            source_name = '百家号'  # source name
            follow_num = re.findall('follow_num.*?:\"(.*?)\",', html)[0]  # following count
            uk = re.findall('uk.*?:\"(.*?)\",', html)[0]  # uk code
            biz = third_id + '/' + uk
            if int(fans_num) >= 10000:
                sql3 = 'select biz from spider_user_info where biz="%s"' % biz
                cursor = self.tools.sqll(sql3)
                result = cursor.fetchall()
                if not result:
                    sql4 = 'insert into spider_user_info(author,home_url,fans_num,avatar_url,source_name,brief,biz,create_time) values("%s","%s","%s","%s","%s","%s","%s","%s")' % (
                        author, home_url, fans_num, avatar_url, source_name, brief, biz, create_time)
                    self.tools.sqll(sql4)
                    print(author, home_url, third_id, fans_num, avatar_url, source_name, brief, biz, create_time, follow_num, uk)
                else:
                    pass
        except Exception as e:
            print(e)
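
# Usage sketch (assumption): take one un-crawled 百家号 author from the table,
# fetch the accounts they follow, and parse each followed account; handling one
# author per call matches the "limit 0,1" in checksql. run_baijia_2 is a
# hypothetical helper name.
def run_baijia_2():
    spider = baijia_2()
    uk = spider.checksql()            # pick an author with status = 0 and mark it taken
    for followed_id in spider.fans(uk):
        spider.parse(followed_id)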
from uitls import tool


class baijia_1():
    tools = tool()
    headers = tools.headers()
    dict_ = tools.dict_()
    browser = tools.browser()

    def get_author(self, i, j):
        # Fetch page j of the 百家号 search results for keyword i and return
        # the authors' homepage URLs.
        datas = {
            "word": "{}+百家号".format(i),
            "pd": "cambrian_list",
            "atn": "index",
            "title": "{}+百家号".format(i),
            "lid": "8219726775998088715",
            "ms": "1",
            "frsrcid": "206",
            "frorder": "1",
            "sig": "593303",
            "pn": 10 * j,
            "mod": "1",
        }
        url = 'https://m.baidu.com/sf'
        response = requests.get(url, params=datas, headers=self.headers)
        response.encoding = 'utf-8'
        html = response.text
        tree = etree.HTML(html)
        datas = tree.xpath('//div[@class="sfc-cambrian-list-subscribe"]')
        urls = []
        for data in datas:
            url = data.xpath('./div/a/@href')[0]
            urls.append(url)
        return urls

    def get_id(self, url):
        # Extract the author's app_id from the homepage URL.
        response = requests.get(url=url)
        response.encoding = 'utf-8'
        html = response.text
        try:
            app_id = re.findall('home/(.*)\?from=dusite_sresults"', html)[0]
            return app_id
        except:
            pass

    def homepage(self, app_id):
        # Fetch the author's profile page source via the app_id.
        for i in self.dict_:
            self.browser.add_cookie({
                'name': i,
                'value': self.dict_[i],
            })
        url = 'https://author.baidu.com/profile?context={%22from%22:%22dusite_sresults%22,%22app_id%22:%22' + str(
            app_id) + '%22}&cmdType=&pagelets[]=root&reqID=0&ispeed=1'
        self.browser.get(url)
        html = self.browser.page_source
        return html

    def parse(self, html, app_id):
        # Parse the profile and store it in MySQL; only authors with more than
        # 10,000 fans are stored.
        html = html.replace("\\", "")
        author = re.findall('display_name\":\"(.*?)\"', html)[0].encode(
            "gb18030", "ignore").decode("utf8", "ignore").replace("\\", "")  # user nickname
        home_url = 'https://author.baidu.com/home/' + app_id  # user homepage URL
        avatar_url = re.findall('avatar_raw\":\"(.*?)",', html)[0]  # user avatar URL
        brief = str(re.findall('sign\":\"(.*?)\",', html)[0]).encode(
            "gb18030", "ignore").decode("utf8", "ignore").replace("\\", "")  # author bio
        fans_num = re.findall('fans_num.*?:\"(.*?)\",', html)[0]  # fan count
        create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())  # record creation time
        source_name = '百家号'  # source name
        follow_num = re.findall('follow_num.*?:\"(.*?)\",', html)[0]  # following count
        uk = re.findall('uk.*?:\"(.*?)\",', html)[0]  # uk code
        biz = app_id + '/' + uk
        if int(fans_num) >= 10000:
            sql1 = 'select biz from spider_user_info where biz="%s"' % biz
            cursor = self.tools.sqll(sql1)
            result = cursor.fetchall()
            if not result:
                sql2 = 'insert into spider_user_info(author,home_url,fans_num,avatar_url,source_name,brief,biz,create_time) values("%s","%s","%s","%s","%s","%s","%s","%s")' % (
                    author, home_url, fans_num, avatar_url, source_name, brief, biz, create_time)
                self.tools.sqll(sql2)
                print(author, home_url, fans_num, avatar_url, source_name, brief, biz, create_time, follow_num, uk)
            else:
                pass
if __name__ == '__main__':
    crawler = baijia_1()
    tools = tool()
    list_ = tools.list_()
    for i in list_:
        for j in range(0, 10):
            urls = crawler.get_author(i, j)
            for url in urls:
                app_id = crawler.get_id(url)
                if not app_id:
                    continue
                html = crawler.homepage(app_id)
                crawler.parse(html, app_id)
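
# The uitls.tool helper is not included in this snapshot. A hedged sketch of the
# interface these spider classes appear to rely on (the method names are taken
# from the call sites above; the signatures and behaviours are assumptions):
#
#   class tool:
#       def headers(self): ...          # default HTTP request headers
#       def dict_(self): ...            # cookie name -> value mapping for selenium
#       def data(self, keyword): ...    # request payload for a keyword search
#       def list_(self): ...            # list of search keywords
#       def browser(self): ...          # a selenium WebDriver instance
#       def sqll(self, sql): ...        # run SQL against MySQL and return the cursor
#       def filter_emoji(self, s): ...  # strip emoji before inserting into MySQL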
class souhu_2():
    tools = tool()
    dict_ = tools.dict_()

    def checksql(self):
        # Pick one 搜狐号 author whose articles have not been crawled yet.
        sql1 = 'select * from spider_user_info where status = 0 and source_name = "搜狐号" limit 0,1 '
        cursor = self.tools.sqll(sql1)
        result = cursor.fetchall()
        info = result[0]
        return info

    def article(self, info):
        # Fetch up to 5 pages of the author's article list (about 100 articles).
        author_id = info['biz'].replace('souhu', "")
        datas = []
        try:
            for i in range(1, 6):
                url = 'https://v2.sohu.com/author-page-api/author-articles/wap/{}?pNo={}'.format(
                    author_id, i)
                response = requests.get(url=url)
                data = json.loads(response.text)['data']['wapArticleVOS']
                datas.extend(data)
            return datas
        except:
            return datas

    def read_num(self, article_id):
        # Fetch the article's read count.
        url = 'https://v2.sohu.com/author-page-api/articles/pv?articleIds={}'.format(
            article_id)
        response = requests.get(url=url)
        num = json.loads(response.text)[str(article_id)]
        return num

    def content(self, article_id):
        # Fetch up to 10 comments (with their like counts) for the article.
        content_replys = []
        url = 'https://api.interaction.sohu.com/api/comments/maincomments?source_id=mp_{}&page_no=1&page_size=10&reply_count=10&type=0'.format(
            article_id)
        try:
            response = requests.get(url=url)
            comments = json.loads(response.text)
            for comment in comments:
                content_reply = {
                    'thumb_num': comment['displayStatus'],
                    'content': comment['content']
                }
                content_replys.append(content_reply)
            return content_replys
        except:
            content_replys = None
            return content_replys

    def parse(self, info):
        datas = self.article(info)
        if len(datas) == 0:
            pass
        else:
            return datas

    def parse_2(self, data, info):
        # Build the article record (read count, comments, author info) and print it.
        article = {}
        try:
            article_id = data['id']
            num = self.read_num(article_id)
            article['read_num'] = num
            article['author'] = info['author']
            article['avatar_url'] = info['avatar_url']
            article['title'] = data['title']
            article['source_url'] = "https://" + data['link']
            article['source_name'] = '搜狐号'
            img_url = data['cover']
            if img_url.split('//')[0] == "http:":
                article['img_url'] = img_url
            else:
                article['img_url'] = 'http:' + img_url
            article['published_time'] = time.strftime(
                "%Y-%m-%d %H:%M:%S",
                time.localtime(int(str(data['publicTime'])[:-3])))
            article["author_info"] = {
                "biz": info['biz'].replace('souhu', ''),  # biz ID
                "brief": self.tools.filter_emoji(info['brief'])  # brief / summary
            }
            content_replys = self.content(str(article_id))
            article['content_reply'] = content_replys
            print(article)
        except:
            pass
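
# Usage sketch (assumption): crawl the article list of one un-crawled 搜狐号
# author and build an article record for each entry; run_souhu_2 is a
# hypothetical helper name.
def run_souhu_2():
    spider = souhu_2()
    info = spider.checksql()        # one 搜狐号 author with status = 0
    datas = spider.parse(info)      # up to 5 pages of article metadata
    for data in datas or []:
        spider.parse_2(data, info)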
class baijia_3():
    tools = tool()
    headers = tools.headers()
    dict_ = tools.dict_()
    browser = tools.browser()

    def checksql(self):
        # Pick one 百家号 author whose articles have not been crawled yet.
        sql1 = 'select * from spider_user_info where status = 1 and source_name = "百家号" limit 0,1 '
        cursor = self.tools.sqll(sql1)
        result = cursor.fetchall()
        info = result[0]
        return info

    def article(self, info):
        # Fetch the page listing up to 100 of the author's articles.
        uk = info['biz'].split('/')[1]
        for i in self.dict_:
            self.browser.add_cookie({
                'name': i,
                'value': self.dict_[i],
            })
        url = 'https://author.baidu.com/list?type=article&tab=2&uk={}&num=100'.format(uk)
        try:
            self.browser.get(url)
        except:
            pass
        html = self.browser.page_source
        return html, uk

    def read_point(self, dynamic, thread, uk):
        # Fetch the article's read, like and comment counts.
        bian = '%5b%7b%22user_type%22%3a%223%22%2c%22dynamic_id%22%3a%22{}%22%2c%22dynamic_type%22%3a%222%22%2c%22dynamic_sub_type%22%3a%222001%22%2c%22thread_id%22%3a%22{}%22%2c%22feed_id%22%3a%22{}%22%7d%5d'.format(
            dynamic, thread, dynamic)
        response = requests.get(
            'https://mbd.baidu.com/webpage?type=homepage&action=interact&format=jsonp&params={}&uk={}'
            .format(bian, uk), timeout=10)
        nums = json.loads(
            response.text.replace('callback(', '').replace(')', ''))['data']['user_list']
        return nums

    def content(self, thread):
        # Fetch up to 10 comments (with their like counts) for the article.
        comment_url = 'https://ext.baidu.com/api/comment/v1/comment/getlist?appid=101&start=0&num=10&thread_id={}'.format(
            thread)
        try:
            r1 = requests.get(url=comment_url, timeout=10)
            comments = json.loads(r1.text)['ret']['list']
            content_replys = []
            if len(comments) >= 10:
                for i in range(10):
                    content_reply = {
                        'thumb_num': comments[i]['like_count'],
                        'content': comments[i]['content']
                    }
                    content_replys.append(content_reply)
                return content_replys
            else:
                for comment in comments:
                    content_reply = {
                        'thumb_num': comment['like_count'],
                        'content': comment['content']
                    }
                    content_replys.append(content_reply)
                return content_replys
        except:
            content_replys = None
            return content_replys

    def parse(self, info):
        # Parse every article on the listing page; mark the author status=2 on
        # success or status=3 on failure.
        html, uk = self.article(info)
        articles = []
        try:
            new = json.loads(re.findall('.*">(.*?)<', html)[0])
            datas = new['data']['list']
            for data in datas:
                article = {}
                try:
                    dynamic = data['dataAttrs']['dynamic-id']
                    thread = data['dataAttrs']['thread-id']
                    nums = self.read_point(dynamic, thread, uk)
                    article['reply_num'] = tuple(nums.values())[0]['comment_num']  # comment count
                    article['read_num'] = tuple(nums.values())[0]['read_num']  # read count
                    article['thumb_num'] = tuple(nums.values())[0]['praise_num']  # like count
                    article['author'] = info['author']  # author
                    article['avatar_url'] = info['avatar_url']  # avatar
                    article['title'] = data['title']  # article title
                    article['source_url'] = data['url']  # article URL
                    article['source_name'] = '百家号'
                    article['img_url'] = data['cover_images'][0]['src']  # article image: first of the three covers
                    article['published_time'] = time.strftime(
                        "%Y-%m-%d %H:%M:%S", time.localtime(data['publish_at']))  # publish time
                    article["author_info"] = {
                        "biz": info['biz'],  # biz ID
                        "fans_num": info['fans_num'],  # fan count
                        "brief": (info['brief'])  # brief / summary
                    }
                    content_replys = self.content(thread)
                    article['content_reply'] = content_replys  # comment info
                    print(article)
                    articles.append(article)
                except:
                    pass
            biz = info['biz']
            sql2 = 'update spider_user_info set status=2 where biz="%s"' % biz
            self.tools.sqll(sql2)
            return articles
        except Exception as e:
            print(e)
            biz = info['biz']
            sql3 = 'update spider_user_info set status=3 where biz="%s"' % biz
            self.tools.sqll(sql3)
            return articles
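
# Usage sketch (assumption): crawl the article list of one 百家号 author whose
# follow list has already been processed (status = 1); run_baijia_3 is a
# hypothetical helper name.
def run_baijia_3():
    spider = baijia_3()
    info = spider.checksql()          # one 百家号 author with status = 1
    articles = spider.parse(info)     # also flips status to 2 (ok) or 3 (failed)
    return articles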