def search_all_comment(keyword):
    mhf = mysqlHelper(get_db()[0], get_db()[1], get_db()[2], get_db()[3],
                      get_db()[4], int(get_db()[5]))
    sqlf = "select wb_id from keyword_weibo where keyword = %s "
    all_wbid_temp = mhf.findAll(sqlf, keyword)  # query the ids of all weibo posts that involve the keyword
    all_wbid = []  # de-duplicate the list of weibo ids to crawl
    for i in all_wbid_temp:
        wbid = i[0]
        if wbid not in all_wbid:
            all_wbid.append(wbid)
    wb_num = len(all_wbid)
    print('The total number of weibo posts involving the keyword {} is {}\n'.format(keyword, str(wb_num)))
    for i in range(len(all_wbid)):
        print("============ Crawling the comments of weibo post No. {} ===========".format(i + 1))
        # fetch_comment_data reads the cookie itself and takes (wbid, userid); the keyword is
        # stored in the comment table's userid column to mark the source of the crawl
        fetch_comment_data(str(all_wbid[i]), keyword)

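
# The membership-test loop above de-duplicates while preserving order, but it is O(n^2).
# A minimal sketch of an order-preserving alternative (hypothetical helper, not part of the
# original project), relying on dict insertion order in Python 3.7+:
def dedup_preserve_order(rows):
    # rows are (wb_id,) tuples as returned by mysqlHelper.findAll; keep the first occurrence of each id
    return list(dict.fromkeys(row[0] for row in rows))
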
def search_all_comment_id(userid):
    mhf = mysqlHelper(get_db()[0], get_db()[1], get_db()[2], get_db()[3],
                      get_db()[4], int(get_db()[5]))
    sqlf = "select wb_id from user_weibo where wb_userid = %s"
    all_wbid_temp = mhf.findAll(sqlf, userid)  # query the ids of all weibo posts published by this user
    all_wbid = []  # de-duplicate the list of weibo ids to crawl
    for i in all_wbid_temp:
        wbid = i[0]
        if wbid not in all_wbid:
            all_wbid.append(wbid)
    wb_num = len(all_wbid)
    print("{} weibo posts of this user have already been crawled\n".format(str(wb_num)))
    for i in range(len(all_wbid)):
        print("============ Crawling the comments of weibo post No. {} ===========".format(i + 1))
        fetch_comment_data(str(all_wbid[i]), userid)

def fetch_comment_data(wbid, userid):
    cookie = Cookie_Process.read_cookie()  # read the cookie stored in the local file
    cookies = {"Cookie": cookie}
    proxies = get_random_ip(ip_list)
    r_comment = requests.get('https://weibo.cn/comment/{}'.format(wbid),
                             cookies=cookies, proxies=proxies)
    soup_comment = BeautifulSoup(r_comment.text, 'lxml')
    flag = False
    try:
        # the last .c node starts with "还没有人针对..." when the post has no comments yet
        flag = soup_comment.select('.c')[-1].text.startswith('还没有人针对')
    except Exception as e:
        page_num = 1
    if flag:
        print("--------- Nobody has commented on this weibo post! ---------\n")
        return
    else:
        try:
            page_num = int(soup_comment.select_one(".pa").text.split()[-1].split("/")[-1].split("页")[0])
        except Exception as e:
            page_num = 1
    mh = mysqlHelper(get_db()[0], get_db()[1], get_db()[2], get_db()[3],
                     get_db()[4], int(get_db()[5]))
    sql = "insert into comment(wb_id,comment_content,comment_userid,comment_username,comment_like,comment_createtime,userid) values(%s,%s,%s,%s,%s,%s,%s)"
    page_id = 1
    commentinfos = []
    print("--------- The comments of weibo post {} span {} pages ---------\n".format(wbid, page_num))
    while page_id < page_num + 1:
        time.sleep(random.uniform(4.5, 6.5))  # sleep for a random interval between requests
        print("++++++ Crawling weibo post {}, comment page {} ...... ++++++\n".format(wbid, page_id))
        r_comment = requests.get(url_comment.format(wbid, page_id), cookies=cookies)
        soup_comment = BeautifulSoup(r_comment.text, 'lxml')
        comment_list = soup_comment.select(".c")
        for l in comment_list:
            if str(l.get("id")).startswith("C_"):
                comment_content = filter_emoji(l.select_one(".ctt").text)
                comment_userid = l.select_one("a").get("href")[3:]
                comment_username = l.select_one("a").text
                comment_like = l.select_one(".cc").text.strip()[2:-1]
                comment_createtime = time_process(l.select_one(".ct").text.strip()[:-5])
                print("Comment content : " + comment_content)
                print("Commenter id    : " + comment_userid)
                print("Commenter name  : " + comment_username)
                print("Comment likes   : " + comment_like)
                print("Comment time    : " + comment_createtime)
                print('----------------------------\n')
                commentinfo = {
                    'wb_id': wbid,  # assemble one comment record
                    'comment_content': comment_content,
                    'comment_userid': comment_userid,
                    'comment_username': comment_username,
                    'comment_like': comment_like,
                    'comment_createtime': comment_createtime,
                    'userid': userid
                }
                commentinfos.append(commentinfo)
        page_id = page_id + 1
        if len(commentinfos) >= 100:  # flush to the database once 100 comments are buffered
            mh.open()
            for i in range(len(commentinfos)):
                mh.cud(sql, (commentinfos[i]['wb_id'],
                             commentinfos[i]['comment_content'],
                             commentinfos[i]['comment_userid'],
                             commentinfos[i]['comment_username'],
                             commentinfos[i]['comment_like'],
                             commentinfos[i]['comment_createtime'],
                             userid))
            mh.tijiao()
            mh.close()
            commentinfos = []
    if len(commentinfos) > 0:  # insert the remaining buffered comments
        mh.open()
        for i in range(len(commentinfos)):
            mh.cud(sql, (commentinfos[i]['wb_id'],
                         commentinfos[i]['comment_content'],
                         commentinfos[i]['comment_userid'],
                         commentinfos[i]['comment_username'],
                         commentinfos[i]['comment_like'],
                         commentinfos[i]['comment_createtime'],
                         userid))
        mh.tijiao()
        mh.close()
    print("--------- All comments of this weibo post have been crawled! ---------\n\n")

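
# url_comment and time_process are assumed to be defined elsewhere in this project.  As an
# illustration only, a hypothetical normalizer for the relative timestamps weibo.cn returns
# in the .ct node (e.g. "x分钟前", "今天 HH:MM", "MM月DD日 HH:MM") might look like the sketch
# below; the exact rules of the real time_process may differ.
import re
from datetime import datetime, timedelta

def normalize_weibo_time(raw):
    now = datetime.now()
    if '分钟前' in raw:  # "n minutes ago"
        minutes = int(re.search(r'(\d+)分钟前', raw).group(1))
        return (now - timedelta(minutes=minutes)).strftime('%Y-%m-%d %H:%M')
    if raw.startswith('今天'):  # "today HH:MM"
        return now.strftime('%Y-%m-%d') + raw[2:]
    m = re.match(r'(\d{2})月(\d{2})日 (\d{2}:\d{2})', raw)  # "MM月DD日 HH:MM"
    if m:
        return '{}-{}-{} {}'.format(now.year, m.group(1), m.group(2), m.group(3))
    return raw  # already an absolute timestamp such as "2019-05-12 10:20:30"
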
def search_all_user(keyword):
    cookie = Cookie_Process.read_cookie()  # read the cookie stored in the local file
    mhf = mysqlHelper(get_db()[0], get_db()[1], get_db()[2], get_db()[3],
                      get_db()[4], int(get_db()[5]))
    sqlf = "select wb_userid from keyword_weibo where keyword = %s "
    all_id_temp = mhf.findAll(sqlf, keyword)  # query the ids of all users involved with the keyword
    all_id = []  # de-duplicate the list of user ids to crawl
    for i in all_id_temp:
        if i not in all_id:
            all_id.append(i)
    id_num = len(all_id)
    print('The total number of weibo users involved with the keyword {} is {}\n'.format(keyword, str(id_num)))
    mh = mysqlHelper(get_db()[0], get_db()[1], get_db()[2], get_db()[3],
                     get_db()[4], int(get_db()[5]))
    sql = "insert into keyword_userinfo(user_id,user_name,user_sex,user_address,user_weizhi,user_renzheng,user_oneword,user_wbnum,user_follow,user_fan,user_url,keyword) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    userinfos = []
    for i in range(len(all_id)):
        temp_user_data = fetch_user_data(int(all_id[i][0]), keyword, cookie)
        if temp_user_data is not None:  # skip users for whom no data was returned
            userinfos.extend(temp_user_data)
        if (i + 1) % 50 == 0:  # commit the buffered rows to the database every 50 users
            mh.open()
            for j in range(len(userinfos)):
                mh.cud(sql, (userinfos[j]['user_id'], userinfos[j]['user_name'],
                             userinfos[j]['user_sex'], userinfos[j]['user_address'],
                             userinfos[j]['user_weizhi'], userinfos[j]['user_renzheng'],
                             userinfos[j]['user_oneword'], userinfos[j]['user_wbnum'],
                             userinfos[j]['user_follow'], userinfos[j]['user_fan'],
                             userinfos[j]['user_url'], keyword))
            mh.tijiao()
            mh.close()
            userinfos = []  # clear the buffer after committing
    if len(userinfos) > 0:  # commit the remaining rows
        mh.open()
        for j in range(len(userinfos)):
            mh.cud(sql, (userinfos[j]['user_id'], userinfos[j]['user_name'],
                         userinfos[j]['user_sex'], userinfos[j]['user_address'],
                         userinfos[j]['user_weizhi'], userinfos[j]['user_renzheng'],
                         userinfos[j]['user_oneword'], userinfos[j]['user_wbnum'],
                         userinfos[j]['user_follow'], userinfos[j]['user_fan'],
                         userinfos[j]['user_url'], keyword))
        mh.tijiao()
        mh.close()

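
# The commit-every-N / flush-remainder pattern above is repeated in several functions here.
# A minimal sketch of how it could be factored into one helper (batch_insert is a hypothetical
# name, not part of the original project; mh is the mysqlHelper instance used above):
def batch_insert(mh, sql, rows):
    # rows is a list of parameter tuples; open one connection, run all inserts, commit once
    if not rows:
        return
    mh.open()
    for params in rows:
        mh.cud(sql, params)
    mh.tijiao()
    mh.close()
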
def fetch_pages(user_id):
    cookie = Cookie_Process.read_cookie()  # read the cookie stored in the local file
    cookies = {"Cookie": cookie}
    # fetch the first page of 'https://weibo.cn/%d' to get the user name and the total page count
    proxies = get_random_ip(ip_list)
    url_user = "******" % (user_id, 1)
    r_user = requests.get(url_user, cookies=cookies, proxies=proxies)
    soup_user = BeautifulSoup(r_user.text, 'lxml')
    # if the user has not published any weibo post, return immediately
    panduan_weibo = soup_user.select_one('.tc').text[3:-1]
    if panduan_weibo == '0':
        print('This user has published 0 weibo posts!')
        return
    user_contents = soup_user.select_one('.ut').select('.ctt')
    temp_user = user_contents[0].text.split()
    wb_username = temp_user[0]  # weibo user name
    try:
        # total number of weibo pages
        page_num = int(soup_user.select_one('.pa').text.split()[1].split('/')[1][:-1]) - 1
        print('--------- Total number of weibo pages: ' + str(page_num) + ' ---------\n')
    except Exception as e:
        page_num = 1
    mblogs = []  # results are buffered in this list first and written to the database later
    page_id = 1
    while page_id <= page_num:
        try:
            # fetch_weibo_data crawls the weibo posts of one page
            mblogs.extend(fetch_weibo_data(user_id, wb_username, page_id))
            if flag == 1:  # flag is expected to be a module-level signal set by fetch_weibo_data; when set, the current page is retried
                continue
        except Exception as e:
            print(e)
        if page_id % 20 == 0:  # commit the buffered rows to the database every 20 pages
            mh = mysqlHelper(get_db()[0], get_db()[1], get_db()[2], get_db()[3],
                             get_db()[4], int(get_db()[5]))
            sql = "insert into user_weibo(wb_userid,wb_username,wb_id,wb_content,wb_createtime,wb_forwardnum,wb_commentnum,wb_likenum) values(%s,%s,%s,%s,%s,%s,%s,%s)"
            mh.open()
            for i in range(len(mblogs)):
                mh.cud(sql, (mblogs[i]['wb_userid'], mblogs[i]['wb_username'],
                             mblogs[i]['wb_id'], filter_emoji(mblogs[i]['wb_content']),
                             mblogs[i]['wb_createtime'], mblogs[i]['wb_forwardnum'],
                             mblogs[i]['wb_commentnum'], mblogs[i]['wb_likenum']))
            mh.tijiao()
            mh.close()
            mblogs = []  # clear the buffer after committing
        page_id = page_id + 1
    if len(mblogs) > 0:  # commit the remaining rows
        mh = mysqlHelper(get_db()[0], get_db()[1], get_db()[2], get_db()[3],
                         get_db()[4], int(get_db()[5]))
        sql = "insert into user_weibo(wb_userid,wb_username,wb_id,wb_content,wb_createtime,wb_forwardnum,wb_commentnum,wb_likenum) values(%s,%s,%s,%s,%s,%s,%s,%s)"
        mh.open()
        for i in range(len(mblogs)):
            mh.cud(sql, (mblogs[i]['wb_userid'], mblogs[i]['wb_username'],
                         mblogs[i]['wb_id'], filter_emoji(mblogs[i]['wb_content']),
                         mblogs[i]['wb_createtime'], mblogs[i]['wb_forwardnum'],
                         mblogs[i]['wb_commentnum'], mblogs[i]['wb_likenum']))
        mh.tijiao()
        mh.close()

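
# filter_emoji is assumed to be defined elsewhere in this project.  One common reason for such
# a filter is that MySQL 'utf8' columns reject 4-byte characters (emoji); a hypothetical sketch
# that strips everything outside the Basic Multilingual Plane could look like this:
def strip_non_bmp(text):
    # drop characters above U+FFFF so the text fits into 3-byte utf8 columns
    return ''.join(ch for ch in text if ord(ch) <= 0xFFFF)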