Example #1
def get_zhuanlan_comment(headers, number, fp):
    # Scrape every comment on the Zhihu column (zhuanlan) article `number`
    # and write (title, comment) rows to the already-open CSV file `fp`.

    tool = Tool()
    writer = csv.writer(fp)

    url_web = 'https://zhuanlan.zhihu.com/api/posts/{}/comments?limit=10&offset='  # column (专栏) comment API
    base_url = url_web.format(number) + '{}'
    url_number = 'https://zhuanlan.zhihu.com/p/' + number  # column article URL
    request = requests.get(url_number, headers=headers)
    soup = BeautifulSoup(request.text, 'lxml')
    title = soup.select('head > title')
    commentCount = soup.select(
        '#react-root > div > div > div.Layout-main.av-card.av-paddingBottom.av-bodyFont.Layout-titleImage--normal > div.PostComment > div.BlockTitle.av-marginLeft.av-borderColor.PostComment-blockTitle > span.BlockTitle-title'
    )
    commentCount = commentCount[0].get_text().split()[0]
    if commentCount == '还没有评论':  # literal page text meaning "no comments yet"
        return
    all_comment_num = int(commentCount)
    if all_comment_num % 10 != 0:
        count = 1 + all_comment_num // 10
    else:
        count = all_comment_num // 10
    for i in range(count):
        url_contents = base_url.format(i * 10)
        wb_data = requests.get(url_contents, headers=headers)
        js = json.loads(wb_data.content)
        for each in js:
            con = tool.replace(each['content'])
            writer.writerow([title[0].get_text(), con])
            print(title[0].get_text(), con)
    time.sleep(random.uniform(2, 4))


# get_zhuanlan_comment(headers, '28047189', fp)
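
All four examples rely on a Tool helper (called as tool.replace(...) to clean scraped HTML) plus several imports, none of which appear in the snippets. Below is a minimal sketch of the assumed support code for Examples #1 and #2; the tag-stripping behavior of Tool.replace is inferred from how it is called, not taken from the source.

import csv
import json
import random
import re
import time

import requests
from bs4 import BeautifulSoup


class Tool:
    # Clean a scraped HTML fragment into plain text. This is only a
    # plausible reconstruction; the real Tool class is not shown above.
    tags = re.compile(r'<[^>]+>')

    def replace(self, text):
        text = self.tags.sub('', text)            # drop HTML tags
        return re.sub(r'\s+', ' ', text).strip()  # collapse whitespace


# Hypothetical driver: `headers` needs a browser User-Agent (plus any cookies
# Zhihu requires); '28047189' is the article id from the commented call above.
headers = {'User-Agent': 'Mozilla/5.0'}
with open('zhuanlan_comments.csv', 'w', newline='', encoding='utf-8') as fp:
    get_zhuanlan_comment(headers, '28047189', fp)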
Example #2
def get_contents(headers, number, fp):
    # Scrape every answer to Zhihu question `number` and write title,
    # follower count, view count, creation time, and cleaned text to `fp`.

    tool = Tool()

    writer = csv.writer(fp)

    url_web = 'https://www.zhihu.com/api/v4/questions/{}/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&limit=20&sort_by=default&offset='
    base_url = url_web.format(number) + '{}'
    url_number = 'https://www.zhihu.com/question/' + str(number)
    wb = requests.get(url_number, headers=headers)
    sp = BeautifulSoup(wb.text, 'lxml')
    num_sel = sp.select(
        '#QuestionAnswers-answers > div > div > div.List-header > h4 > span')
    follower = sp.select(
        '#root > div > main > div > div > div.QuestionHeader > div.QuestionHeader-content > div.QuestionHeader-side > div > div > div > button > div.NumberBoard-value'
    )
    title = sp.select(
        '#root > div > main > div > div > div.QuestionHeader > div.QuestionHeader-content > div.QuestionHeader-main > h1'
    )
    browsed = sp.select(
        '#root > div > main > div > div > div.QuestionHeader > div.QuestionHeader-content > div.QuestionHeader-side > div > div > div > div.NumberBoard-item > div.NumberBoard-value'
    )
    if not num_sel:  # selector matched nothing (layout changed or page blocked)
        return
    all_answer_num = int(num_sel[0].get_text().split()[0])
    if all_answer_num % 20 != 0:
        count = 1 + all_answer_num // 20
    else:
        count = all_answer_num // 20
    for i in range(count):
        url_contents = base_url.format(i * 20)
        wb_data = requests.get(url_contents, headers=headers)
        js = json.loads(wb_data.content)
        for each in js['data']:
            con = tool.replace(each['content'])
            timestamp = each['created_time']
            time_local = time.localtime(timestamp)  # convert the timestamp to local time
            dt = time.strftime("%Y-%m-%d %H:%M:%S", time_local)
            # Encode the date as month.day (e.g. June 18 -> 6.18) for range filtering
            times = time_local.tm_mon + time_local.tm_mday / 100
            if 6.18 <= times <= 8.28:  # keep only answers from June 18 through August 28
                writer.writerow([
                    title[0].get_text(), follower[0].get_text(),
                    browsed[0].get_text(), dt, con
                ])
                print(title[0].get_text(), follower[0].get_text(),
                      browsed[0].get_text(), dt, con)
    time.sleep(random.uniform(2, 4))
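
A matching driver sketch for get_contents; the question id below is a placeholder, and `headers` is assumed to be set up as in Example #1.

# Hypothetical usage; QUESTION_ID is a placeholder, not a real question id.
QUESTION_ID = 123456789
with open('answers.csv', 'w', newline='', encoding='utf-8') as fp:
    get_contents(headers, QUESTION_ID, fp)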
Example #3
class Baidutieba:
    # Parameters: seeLz: 1 to show only the original poster's floors, 0 for all;
    # floorTag: 1 to write a separator line between floors
    def __init__(self, baseUrl, seeLz, floorTag):
        self.baseUrl = baseUrl
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self.headers = {'User-Agent' : self.user_agent, 'Content-Type':'text/html; charset=UTF-8', 'Vary':'Accept-Encoding'}
        self.seeLz = '?see_lz=' + str(seeLz)
        self.tool = Tool()
        self.floor = 1
        self.defaultTitle = u"百度贴吧"  # fallback file title ("Baidu Tieba")
        self.floorTag = floorTag
    
    def getPage(self, pageNum):
        # Fetch page `pageNum` of the thread; return decoded HTML, or None on failure.
        try:
            url = self.baseUrl + self.seeLz + "&pn=" + str(pageNum)
            request = urllib2.Request(url, headers=self.headers)
            response = urllib2.urlopen(request)
            #print response.read().decode('utf-8')
            return response.read().decode('utf-8')
        except urllib2.URLError as e:
            if hasattr(e, "reason"):
                print u"连接百度贴吧失败,错误原因:", e.reason
                return None

    # Get the thread title
    def getTitle(self, page):
        pattern = re.compile('<h3 class="core_title_txt.*?>(.*?)</h3>', re.S)
        result = re.search(pattern, page)
        if result:
            #print result.group(1)
            return result.group(1).strip()
        else:
            return None


    # Extract the thread's page count
    def getPageNums(self, page):
        pattern = re.compile('<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>', re.S)
        result = re.search(pattern, page)
        if result:
            #print result.group(1)
            return result.group(1).strip()
        else:
            return None


    # Get the content of each floor (post); takes the page HTML
    def getContent(self, page):
        pattern = re.compile('<div id="post_content_.*?>(.*?)<.div>', re.S)
        items = re.findall(pattern, page)
        contents = []
        for item in items:
            content = "\n" + self.tool.replace(item) + "\n"
            contents.append(content.encode('utf-8'))
        return contents


    def setFileTitle(self, title):
        if title is not None:
            self.file = open(title + ".txt", "w+")
        else:
            self.file = open(self.defaultTitle + ".txt", "w+")


    def writeData(self, contents):
        # Write each floor's content, optionally preceded by a separator line.
        for item in contents:
            if str(self.floorTag) == "1":
                floorLine = "\n" + str(self.floor) + u"-----------------------------------------------------------------------------------------\n"
                self.file.write(floorLine)
            self.file.write(item)
            self.floor += 1


    def start(self):
        indexPage = self.getPage(1)
        if indexPage is None:
            return
        pageNums = self.getPageNums(indexPage)
        if pageNums is None:
            print u"Invalid URL, please try again"
            return
        title = self.getTitle(indexPage)
        self.setFileTitle(title)
        try:
            print "This thread has " + str(pageNums) + " pages"
            for i in range(1, int(pageNums) + 1):
                print "Writing data for page " + str(i)
                page = self.getPage(i)
                contents = self.getContent(page)
                self.writeData(contents)
        except IOError as e:
            print "Failed to write file, reason: " + e.message
        finally:
            self.file.close()
            print u"Finished writing file"
Example #4
 def get_zhuanfa(self):
     # Scrape the comments on one weibo post, look up each commenter's
     # gender and location from their profile, and write rows to Excel.
     excel = xlwt.Workbook(encoding='utf-8')
     sheet = excel.add_sheet('sheet1')
     sheet.write(0, 0, 'id')
     sheet.write(0, 1, 'name')
     sheet.write(0, 2, 'time')
     sheet.write(0, 3, 'text')
     sheet.write(0, 4, 'likes')
     sheet.write(0, 5, 'loc')
     sheet.write(0, 6, 'sex')
     for m in range(7, 12):
         sheet.write(0, m, 'emoji' + str(m - 6))  # headers were '表情1'..'表情5'
     count = 0
     i = 0
     while i <= 101 and count < 5000:  # adjust the record limit here
         url = 'https://m.weibo.cn/api/comments/show?id=4336744058977011&page='  # change the weibo id here, e.g. 4308796748087542
         i = i + 1
         url = url + str(i)
         print(url)
         try:
             response = requests.get(url, headers=headers)
             time.sleep(1)
             resjson = json.loads(response.text)
             time.sleep(1) 
             dataset = resjson.get('data')
             data = dataset.get('data')
             for j in range(0, len(data)):
                 try:
                     temp = data[j]
                     # if temp.get('reply_id') is not None:
                     #     continue
                     user = temp.get('user')
                     text = temp.get('text')
                     created_at = temp.get('created_at')
                     attitudes_count = temp.get('attitudes_count')
                     userid = user.get('id')
                     info_url = "https://m.weibo.cn/api/container/getIndex?containerid=230283" + str(
                         userid) + "_-_INFO"  # 转发人信息的url
                     r = requests.get(info_url)
                     infojson = json.loads(r.text)
                     infodata = infojson.get('data')
                     cards = infodata.get('cards')
                     sex = ''
                     loc = ''
                     for l in range(0, len(cards)):
                         temp = cards[l]
                         card_group = temp.get('card_group')
                         for m in range(0, len(card_group)):
                             s = card_group[m]
                             if s.get('item_name') == '性别':  # '性别' = gender
                                 sex = s.get('item_content')
                             if s.get('item_name') == '所在地':  # '所在地' = location
                                 loc = s.get('item_content')
                                 loc = re.split(r' ', loc)[0]
                     if not sex:  # initialized to '', so test falsiness, not None
                         sex = 'unknown'
                     if not loc:
                         loc = 'unknown'
                     screen_name = user.get('screen_name')
                     count += 1
                     emoji = get_emoj(text)  # call once instead of twice
                     if emoji:
                         col = 7  # emoji columns start at index 7
                         while emoji:
                             sheet.write(count, col, str(emoji.pop()))
                             col += 1
                     #File.write(text.encode('utf-8') + '\n')
                     sheet.write(count, 0, userid)
                     sheet.write(count, 1, str(screen_name))
                     sheet.write(count, 2, created_at)
                     text = deleteAite(text)  # strip @-mentions
                     text = Tool().replace(text)  # replace() needs an instance
                     if text is not None:
                         sheet.write(count, 3, text)  # xlwt accepts unicode directly
                     sheet.write(count, 4, attitudes_count)
                     sheet.write(count, 5, str(loc))
                     sheet.write(count, 6, str(sex))
                 except Exception as e:
                     print (e)
                 time.sleep(3)
             print ("已经获取" + str(count) + "条数据")
             time.sleep(8)
         except Exception as e:
             print (e)
     # File.close()
     excel.save('单身人群超过2000.xls')  # change the output filename here
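
Example #4 also depends on two helpers that are not shown, deleteAite (strip @-mentions) and get_emoj (collect bracketed Weibo emoticons such as [doge]), plus `json`, `re`, `requests`, `time`, and `xlwt` imports and a module-level `headers` dict. A plausible sketch of the two helpers, inferred only from how they are used above:

import re


def deleteAite(text):
    # Remove @-mentions such as "回复@user:" or "@user " from a comment.
    return re.sub(r'@[^\s:：,，]+[:：]?', '', text)


def get_emoj(text):
    # Collect bracketed emoticons like [doge]; return None when there are
    # none so the caller's truthiness check skips the emoji columns.
    found = re.findall(r'\[[^\[\]]+\]', text)
    return found if found else None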