def get_zhuanlan_comment(headers, number, fp=None):
    """Fetch every comment of one Zhihu column (zhuanlan) article into CSV.

    Each CSV row is ``[article title, cleaned comment text]``.

    Args:
        headers: HTTP headers dict (must carry a valid User-Agent/cookies).
        number:  article id as a string, e.g. ``'28047189'``.
        fp:      an *open, writable* file object for ``csv.writer``; passing
                 ``None`` (the default) will fail at the first write.
    """
    tool = Tool()
    writer = csv.writer(fp)
    # Comment API pages 10 items at a time via the `offset` query argument.
    url_web = 'https://zhuanlan.zhihu.com/api/posts/{}/comments?limit=10&offset='  # column comments API
    base_url = url_web.format(number) + '{}'
    url_number = 'https://zhuanlan.zhihu.com/p/' + number  # article page (title + comment count)
    request = requests.get(url_number, headers=headers)
    soup = BeautifulSoup(request.text, 'lxml')
    title = soup.select('head > title')
    commentCount = soup.select(
        '#react-root > div > div > div.Layout-main.av-card.av-paddingBottom.av-bodyFont.Layout-titleImage--normal > div.PostComment > div.BlockTitle.av-marginLeft.av-borderColor.PostComment-blockTitle > span.BlockTitle-title'
    )
    # Fix: guard against empty selector results (login wall, layout change,
    # blocked request) — the original raised IndexError on `[0]` here.
    if not title or not commentCount:
        return
    commentCount = commentCount[0].get_text().split()[0]
    if commentCount == '还没有评论':  # literal "no comments yet" label
        return
    all_comment_num = int(commentCount)
    # Number of 10-comment pages, rounded up.
    count = all_comment_num // 10 + (1 if all_comment_num % 10 else 0)
    for i in range(count):
        url_contents = base_url.format(i * 10)
        wb_data = requests.get(url_contents, headers=headers)
        js = json.loads(wb_data.content)
        for each in js:
            con = tool.replace(each['content'])
            writer.writerow([title[0].get_text(), con])
            print(title[0].get_text(), con)
        time.sleep(random.uniform(2, 4))  # throttle between pages to avoid bans
def get_contents(headers, number, fp):
    """Fetch every answer of a Zhihu question and write selected fields to CSV.

    Each CSV row is ``[question title, follower count, view count,
    answer datetime, cleaned answer text]``.  Only answers created between
    June 18 and August 28 (inclusive) are written — the date is encoded as
    ``month + day/100`` (e.g. July 5 -> 7.05) and window-tested.

    Args:
        headers: HTTP headers dict with valid cookies / User-Agent.
        number:  question id (int or str).
        fp:      open, writable file object for ``csv.writer``.
    """
    tool = Tool()
    writer = csv.writer(fp)
    # v4 answers API, 20 answers per page via the `offset` query argument.
    url_web = 'https://www.zhihu.com/api/v4/questions/{}/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&limit=20&sort_by=default&offset='
    base_url = url_web.format(number) + '{}'
    url_number = 'https://www.zhihu.com/question/' + str(number)
    wb = requests.get(url_number, headers=headers)
    sp = BeautifulSoup(wb.text, 'lxml')
    num_sel = sp.select(
        '#QuestionAnswers-answers > div > div > div.List-header > h4 > span')
    follower = sp.select(
        '#root > div > main > div > div > div.QuestionHeader > div.QuestionHeader-content > div.QuestionHeader-side > div > div > div > button > div.NumberBoard-value'
    )
    title = sp.select(
        '#root > div > main > div > div > div.QuestionHeader > div.QuestionHeader-content > div.QuestionHeader-main > h1'
    )
    browsed = sp.select(
        '#root > div > main > div > div > div.QuestionHeader > div.QuestionHeader-content > div.QuestionHeader-side > div > div > div > div.NumberBoard-item > div.NumberBoard-value'
    )
    # Fix: the original only guarded `num_sel`; an empty `title`, `follower`
    # or `browsed` result (layout change / login wall) raised IndexError
    # below.  Bail out cleanly when any selector found nothing.
    if not (num_sel and follower and title and browsed):
        return
    all_answer_num = int(num_sel[0].get_text().split()[0])
    # Number of 20-answer pages, rounded up.
    count = all_answer_num // 20 + (1 if all_answer_num % 20 else 0)
    for i in range(count):
        url_contents = base_url.format(i * 20)
        wb_data = requests.get(url_contents, headers=headers)
        js = json.loads(wb_data.content)
        for each in js['data']:
            con = tool.replace(each['content'])
            timestamp = each['created_time']
            # Convert epoch seconds to local time, then to display string.
            time_local = time.localtime(timestamp)
            dt = time.strftime("%Y-%m-%d %H:%M:%S", time_local)
            times = time_local.tm_mon + time_local.tm_mday / 100
            # Positive form of the original double-negative `pass/else`:
            # keep only answers dated June 18 .. August 28.
            if 6.18 <= times <= 8.28:
                writer.writerow([
                    title[0].get_text(), follower[0].get_text(),
                    browsed[0].get_text(), dt, con
                ])
                print(title[0].get_text(), follower[0].get_text(),
                      browsed[0].get_text(), dt, con)
        time.sleep(random.uniform(2, 4))  # throttle between pages
class Baidutieba:
    """Baidu Tieba thread scraper: downloads every post of a thread into a
    text file named after the thread title.

    Ported from Python 2 (``urllib2``, ``print`` statements, ``e.message``)
    to Python 3, so the class is consistent with the rest of this file,
    which already uses Python 3 (``requests`` + ``print(...)`` calls).
    """

    def __init__(self, baseUrl, seeLz, floorTag):
        """seeLz: 1 to show only the thread starter's posts;
        floorTag: 1 to write a separator line between floors."""
        self.baseUrl = baseUrl
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self.headers = {'User-Agent': self.user_agent,
                        'Content-Type': 'text/html; charset=UTF-8',
                        'Vary': 'Accept-Encoding'}
        self.seeLz = '?see_lz=' + str(seeLz)
        self.tool = Tool()
        self.floor = 1                  # running floor counter across pages
        self.defaultTitle = "百度贴吧"   # fallback output filename stem
        self.floorTag = floorTag

    def getPage(self, pageNum):
        """Return the decoded HTML of one page of the thread, or None on a
        connection error."""
        # Local import keeps the py3 port self-contained without touching
        # the file's (unseen) top-level import block.
        import urllib.request
        import urllib.error
        try:
            url = self.baseUrl + self.seeLz + "&pn=" + str(pageNum)
            request = urllib.request.Request(url, headers=self.headers)
            response = urllib.request.urlopen(request)
            return response.read().decode('utf-8')
        except urllib.error.URLError as e:
            if hasattr(e, "reason"):
                print("连接百度贴吧失败,错误原因:", e.reason)
            return None

    def getTitle(self, page):
        """Extract the thread title from page HTML; None if not found."""
        pattern = re.compile('<h3 class="core_title_txt.*?>(.*?)</h3>', re.S)
        result = re.search(pattern, page)
        if result:
            return result.group(1).strip()
        else:
            return None

    def getPageNums(self, page):
        """Extract the total page count (as a string); None if not found."""
        pattern = re.compile(
            '<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>', re.S)
        result = re.search(pattern, page)
        if result:
            return result.group(1).strip()
        else:
            return None

    def getContent(self, page):
        """Return a list of cleaned post bodies found in page HTML."""
        pattern = re.compile('<div id="post_content_.*?>(.*?)<.div>', re.S)
        items = re.findall(pattern, page)
        contents = []
        for item in items:
            content = "\n" + self.tool.replace(item) + "\n"
            # Fix: keep str — the output file is opened in text mode with
            # an explicit utf-8 encoding; writing bytes would raise in py3.
            contents.append(content)
        return contents

    def setFileTitle(self, title):
        """Open the output file, named after the title (or a default)."""
        if title is not None:
            self.file = open(title + ".txt", "w+", encoding='utf-8')
        else:
            self.file = open(self.defaultTitle + ".txt", "w+", encoding='utf-8')

    def writeData(self, contents):
        """Write each post; optionally precede it with a floor separator."""
        for item in contents:
            if str(self.floorTag) == "1":
                floorLine = "\n" + str(self.floor) + "-----------------------------------------------------------------------------------------\n"
                self.file.write(floorLine)
            self.file.write(item)
            self.floor += 1

    def start(self):
        """Drive the crawl: probe page 1, then fetch and write every page."""
        indexPage = self.getPage(1)
        pageNums = self.getPageNums(indexPage)
        title = self.getTitle(indexPage)
        self.setFileTitle(title)
        if pageNums is None:
            print("URL失效,请重试")
            return
        try:
            print("该帖子共有" + str(pageNums) + "页")
            for i in range(1, int(pageNums) + 1):
                print("正在写入第" + str(i) + "页数据")
                page = self.getPage(i)
                contents = self.getContent(page)
                self.writeData(contents)
        except IOError as e:
            # Fix: exception objects lost `.message` in Python 3.
            print("写入文件异常,原因:" + str(e))
        finally:
            # Fix: the original never closed the output file handle.
            self.file.close()
            print("写入文件完毕")
def get_zhuanfa(self):
    """Scrape the comments of one fixed Weibo post (id hard-coded in the URL),
    look up each commenter's profile (sex, location), and dump everything
    into an .xls workbook via xlwt.

    Columns: 0 id, 1 name, 2 time, 3 text, 4 likes, 5 loc, 6 sex,
    7-11 up to five emoji extracted from the comment text.

    NOTE(review): relies on a module-level `headers` plus helpers
    `get_emoj`, `deleteAite`, `Tool.replace` defined elsewhere — behavior of
    those is assumed, not visible here.
    """
    excel = xlwt.Workbook(encoding='utf-8')
    sheet = excel.add_sheet('sheet1')
    # Header row.
    sheet.write(0, 0, 'id')
    sheet.write(0, 1, 'name')
    sheet.write(0, 2, 'time')
    sheet.write(0, 3, 'text')
    sheet.write(0, 4, 'likes')
    sheet.write(0, 5, 'loc')
    sheet.write(0, 6, 'sex')
    # Emoji columns 7..11 get headers "表情1".."表情5".
    for m in range(7, 12):
        sheet.write(0, m, '表情' + str(m-6))
    count = 0  # rows written so far (also the next row index)
    i = 0      # comment-API page number
    while i <= 101 and count < 5000:  # change here to adjust how many rows are fetched
        url = 'https://m.weibo.cn/api/comments/show?id=4336744058977011&page='  # change the id here, e.g. 4308796748087542
        i = i + 1
        url = url + str(i)
        print(url)
        try:
            response = requests.get(url, headers=headers)
            time.sleep(1)
            resjson = json.loads(response.text)
            time.sleep(1)
            dataset = resjson.get('data')
            data = dataset.get('data')  # list of comment objects on this page
            for j in range(0, len(data)):
                try:
                    temp = data[j]
                    # if temp.get('reply_id') is not None:
                    #     continue
                    user = temp.get('user')
                    text = temp.get('text')
                    created_at = temp.get('created_at')
                    attitudes_count = temp.get('attitudes_count')
                    userid = user.get('id')
                    info_url = "https://m.weibo.cn/api/container/getIndex?containerid=230283" + str(
                        userid) + "_-_INFO"  # commenter's profile-info URL
                    r = requests.get(info_url)
                    infojson = json.loads(r.text)
                    infodata = infojson.get('data')
                    cards = infodata.get('cards')
                    sex = ''
                    loc = ''
                    # Scan the profile "cards" for the sex / location items.
                    for l in range(0, len(cards)):
                        temp = cards[l]
                        card_group = temp.get('card_group')
                        for m in range(0, len(card_group)):
                            s = card_group[m]
                            if s.get('item_name') == '性别':
                                sex = s.get('item_content')
                            if s.get('item_name') == '所在地':
                                loc = s.get('item_content')
                                # Keep only the province (text before first space).
                                loc = re.split(r' ', loc)[0]
                    if sex is None:
                        sex = '未知'
                    if loc is None:
                        loc = '未知'
                    screen_name = user.get('screen_name')
                    count += 1
                    # Up to 5 emoji from the text into columns 7+.
                    if get_emoj(text) is not None:
                        emoji = get_emoj(text)
                        l = 7
                        while emoji:
                            sheet.write(count, l, str(emoji.pop()))
                            l += 1
                    #File.write(text.encode('utf-8') + '\n')
                    sheet.write(count, 0, userid)
                    sheet.write(count, 1, str(screen_name))
                    sheet.write(count, 2, created_at)
                    text = deleteAite(text)       # strip @-mentions
                    text = Tool.replace(text)     # strip markup/noise
                    if text is not None:
                        sheet.write(count, 3, text.encode('utf-8'))
                    sheet.write(count, 4, attitudes_count)
                    sheet.write(count, 5, str(loc))
                    sheet.write(count, 6, str(sex))
                except Exception as e:
                    # Best-effort: skip a bad comment, back off, continue.
                    print (e)
                    time.sleep(3)
            print ("已经获取" + str(count) + "条数据")
            time.sleep(8)  # long pause between pages to avoid rate limiting
        except Exception as e:
            print (e)
    # File.close()
    excel.save('单身人群超过2000.xls')  # change the output filename here