Example No. 1
    def answer_parse(self, response):
        # item = response.meta.get("item")
        data_list = json.loads(response.text)  # response.body_as_unicode() is deprecated
        questionID = data_list['data'][0]['question']['id']

        for data in data_list['data']:
            item = ZhihuItem()  # fresh item per answer so earlier yields are not overwritten
            item['question_url'] = 'https://www.zhihu.com/question/' + str(
                questionID)
            item['question_title'] = data['question']['title']  # question title
            item['answer_url'] = data['url']
            item['answer_content'] = data['content']
            item['answer_voteup_count'] = data['voteup_count']
            item['author_url'] = 'https://www.zhihu.com/people/' + data[
                'author']['url_token'] + '/activities'  # answerer profile URL
            item['author_name'] = data['author']['name']  # answerer name
            item['author_gender'] = data['author']['gender']  # answerer gender

            yield item

        # pagination
        if not data_list['paging']['is_end']:
            self.answer_offset_dict[questionID] += 5
            print("mydict:", self.answer_offset_dict)
            answer_url = self.answer_url.format(
                questionID, self.answer_offset_dict[questionID])
            yield scrapy.Request(
                url=answer_url,
                callback=self.answer_parse,
                headers=self.headers,
                meta={'cookiejar': response.meta['cookiejar']})
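
The snippet assumes the spider defines an answer_url template and an answer_offset_dict keyed by question ID. A minimal sketch of that setup; the endpoint string and the limit of 5 (implied by the += 5 step) are assumptions, not the original code:

    from collections import defaultdict

    # assumed spider-level attributes backing answer_parse above;
    # the endpoint is a guess based on Zhihu's public v4 API
    answer_url = ('https://www.zhihu.com/api/v4/questions/{}/answers'
                  '?include=data[*].content,voteup_count&offset={}&limit=5')
    answer_offset_dict = defaultdict(int)  # question ID -> current paging offset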
Example No. 2
    def parse_find(self, response):
        try:
            bs_obj = BeautifulSoup(response.body, 'html.parser')
            div_objs = bs_obj.find_all('div', {'class': 'explore-feed feed-item'})
            item = ZhihuItem()
            item['data'] = list()
            item['type'] = 1
            data_offset = 0
            for div_obj in div_objs:
                data = dict()
                attr_obj = div_obj.find('div', {'class': 'zm-item-answer '})
                data['id'] = attr_obj.attrs['data-atoken']
                data['content'] = str(div_obj)
                item['data'].append(data)

                offset = int(div_obj.attrs['data-offset'])  # attrs are strings; cast before comparing
                if offset > data_offset:
                    data_offset = offset
            print('Discover feed:', item['data'])
            yield item

            if data_offset > self.find_page:
                self.find_page = data_offset
                sleep(CRAWL_DELAY)
                yield self.__get_finds()
        except AttributeError as e:
            print('Failed to fetch the Discover feed:', e)  # e.message was removed in Python 3
Example No. 3
    def parse_item(self, response):
        url = response.url
        print(url)
        nameinfo = response.xpath(
            '//div[@class="ProfileHeader-contentHead"]/h1/span/text()'
        ).extract()
        priseinfo = response.xpath('//div[@class="IconGraf"]/text()').extract()
        followinfo = response.xpath(
            '//div[@class="NumberBoard-value"]/text()').extract()
        shoucanginfo = response.xpath(
            '//div[@class="Profile-sideColumnItemValue"]/text()').extract()
        #edu = response.xpath('//svg[@class="Icon--education"]/parent::*/parent::div/text()').extract()
        sextype = 0 if response.xpath(
            '//svg[contains(@class, "Icon--female")]') else 1
        name = ""
        desc = ""
        prise = ""
        strprise = ""
        shoucang = ""
        strshou = ""
        following = ""
        follower = ""
        nprise = 0
        nganxie = 0
        nshoucang = 0
        nfollowing = 0
        nfollower = 0

        if len(nameinfo) == 2:
            name = nameinfo[0]
            desc = nameinfo[1]
            print(name)
        if len(priseinfo) == 1:
            prise = priseinfo[0]
            strprise = re.findall(r'\d+', prise)
            if strprise:  # re.findall returns a list, never None
                nprise = int(strprise[0])
        if len(shoucanginfo) == 1:
            shoucang = shoucanginfo[0]
            nshou = re.findall(r'\d+', shoucang)
            if len(nshou) >= 2:  # expect two numbers: thanks and favorites
                nganxie = int(nshou[0])
                nshoucang = int(nshou[1])
        if len(followinfo) == 2:
            nfollowing = int(followinfo[0])
            nfollower = int(followinfo[1])

        if len(nameinfo) == 2:
            item = ZhihuItem()
            item['url'] = url
            item['name'] = name
            item['desc'] = desc
            item['sextype'] = sextype
            item['prise'] = nprise
            item['ganxie'] = nganxie
            item['shoucang'] = nshoucang
            item['following'] = nfollowing
            item['follower'] = nfollower
            yield item
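
The findall-then-index pattern above is easy to get wrong; a small helper keeps the guard in one place. first_int is a hypothetical name, not part of the original spider:

    import re

    def first_int(text, default=0):
        # return the first integer embedded in text, or default if none
        m = re.search(r'\d+', text)
        return int(m.group()) if m else default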
Example No. 4
 def parse(self, response):
     print(response.body)
     for sel in response.xpath(
             '//*[@id="root"]/div/main/div/div/div[1]/div[2]/div/div/div/div[2]'
     ):
         item = ZhihuItem()  # fresh item per matched node
         item['actor_image'] = sel.xpath('text()').extract()  # typo fixed: extact -> extract
         print(sel.xpath('text()'))
         yield item
Example No. 5
 def parse_user(self, response):
     with open('lkf.txt', 'w') as f:  # debug dump of the raw JSON response
         f.write(response.text)
     result = json.loads(response.text)
     item = ZhihuItem()
     for field in item.fields:
         if field in result.keys():
             item[field] = result.get(field)
     yield item
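
Several of these snippets rely on the "for field in item.fields" idiom: scrapy.Item exposes its declared fields as a dict, so an item can be filled generically from the API JSON. A minimal sketch of such an item definition; the field names here are assumptions based on Zhihu's user API, not the project's actual items.py:

    import scrapy

    class ZhihuItem(scrapy.Item):
        # assumed field set: one scrapy.Field per JSON key you want copied over
        id = scrapy.Field()
        name = scrapy.Field()
        url_token = scrapy.Field()
        headline = scrapy.Field()
        gender = scrapy.Field()
        answer_count = scrapy.Field()
        follower_count = scrapy.Field()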
Example No. 6
 def parse(self, response):
     problem = Selector(response)
     item = ZhihuItem()
     item['url'] = response.url
     item['name'] = problem.xpath('//span[@class="name"]/text()').extract()
     print(item['name'])
     item['title'] = problem.xpath('//h2[@class="zm-item-title zm-editable-content"]/text()').extract()
     item['description'] = problem.xpath('//div[@class="zm-editable-content"]/text()').extract()
     item['answer']= problem.xpath('//div[@class=" zm-editable-content clearfix"]/text()').extract()
     return item
Example No. 7
    def user_parse(self, response):  # parse the detail JSON for a single user
        results = json.loads(response.text)
        item = ZhihuItem()

        user_token = results.get('url_token')  # hoisted: otherwise unset when no field matches
        for field in item.fields:
            if field in results.keys():
                item[field] = results.get(field)
        yield item
        yield Request(self.follower_url.format(follower_name=user_token,
                                               include=self.follower_query,
                                               offset=0, limit=20),
                      callback=self.follower_parse)
Example No. 8
 def parse_comment(self, response):
     item = ZhihuItem()
     jd = json.loads(response.text)
     name = []
     content = []
     for comment in jd['data']:
         name.append(comment['author']['member']['name'])
         content.append(comment['content'])
     # note: zipping into a dict keeps only the last comment per author name
     item['comment_content'] = dict(zip(name, content))
     yield item
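
If same-named authors matter, a list of (name, content) pairs preserves every comment. A variant sketch under the same assumed JSON shape; parse_comment_pairs is a hypothetical name:

 def parse_comment_pairs(self, response):
     item = ZhihuItem()
     jd = json.loads(response.text)
     # keep all comments, including those by authors who share a display name
     item['comment_content'] = [
         (c['author']['member']['name'], c['content']) for c in jd['data']
     ]
     yield item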
Example No. 9
 def parse_next(self, next_content):
     pages = next_content.xpath('.//div[@class="feed-content"]')
     for page in pages:
         item = ZhihuItem()  # one item per entry; a single shared item kept only the last page
         item['title'] = page.xpath(
             './/h2[@class="feed-title"]/a/text()').extract()
         item['content'] = page.xpath(
             './/div[@class="zh-summary summary clearfix"]/text()').extract()
         yield item
Example No. 10
    def user_parse(self, response):
        user_item = ZhihuItem()
        results = json.loads(response.text)
        for field in user_item.fields:
            if field in results.keys():
                user_item[field] = results.get(field)
        yield user_item

        # crawl this user's followee list and follower list
        yield Request(url=self.follows_url.format(user=results.get('url_token'),
                                                  include=self.follows_include,
                                                  limit=20, offset=0),
                      callback=self.follow_parse)
        yield Request(url=self.followers_url.format(user=results.get('url_token'),
                                                    include=self.followers_include,
                                                    limit=20, offset=0),
                      callback=self.followers_parse)
Example No. 11
 def parse(self, response):
     titles = response.xpath(
         '//*[@id="TopicMain"]/div[3]/div/div/div//div[@class="List-item TopicFeedItem"]//h2/text()'
     ).extract()
     count = 0  # was undefined; the original raised NameError on first increment
     for title in titles:
         count += 1
         zh_item = ZhihuItem()
         zh_item['title'] = title
         print(zh_item)
         yield zh_item
     print('Collected', count, 'items')
Example No. 12
 def parse(self, response):
     self.__log.info('parse')
     html_doc = response.body
     soup = BeautifulSoup(html_doc, 'lxml')
     filename = 'output/zhihu.question'  # assumes the output/ directory exists
     with open(filename, 'wb') as f:
         f.write(response.body)
     item = ZhihuItem()
     item['title'] = soup.title.string
     # find_all returns Tag objects; call .get_text() on each if plain strings are wanted
     item['question'] = soup.find_all('a', class_='question_link')
     item['corp'] = soup.find_all('span', class_='corp')
     return item
Example No. 13
    def get_question_title(self, response):
        # escape embedded \" sequences before json.loads; response.text is used
        # because response.body is bytes in Python 3 and str.replace would fail
        line = response.text.replace("\\\"", "\\\\\\\"")
        title = json.loads(line).get('title')

        zhihuItem = ZhihuItem()
        zhihuItem['KeyWord'] = response.meta['key_word']
        zhihuItem['Topic_name'] = response.meta['Topic_name']
        zhihuItem['Topic_id'] = response.meta['topic_id']
        zhihuItem['Question_id'] = response.meta['Question_id']
        zhihuItem['Question_content'] = title
        zhihuItem['Content'] = response.meta['content']
        yield zhihuItem
Example No. 14
    def parse(self, response):
        # debug: dump the page to verify that the cookies above kept us logged in
        # with open("zhihu.com", "wb") as f:
        #     f.write(response.body)

        #deserialize the json response
        response_json = json.loads(response.text)

        if not response_json["paging"]["is_end"]:
            yield scrapy.http.Request(response_json["paging"]["next"],
                                      callback=self.parse,
                                      headers=self.headers,
                                      cookies=self.cookies)
        if response_json["data"]:
            for data in response_json["data"]:
                if data["url_token"]:
                    # url_to_crawl is assumed to be a module-level URL template
                    yield scrapy.http.Request(url_to_crawl.format(
                        data["url_token"]),
                                              callback=self.parse,
                                              headers=self.headers,
                                              cookies=self.cookies)
                    zhihu_itemloader = ItemLoader(item=ZhihuItem(),
                                                  response=response)
                    zhihu_itemloader.add_value("is_followed",
                                               data["is_followed"])
                    zhihu_itemloader.add_value("avatar_url_template",
                                               data["avatar_url_template"])
                    zhihu_itemloader.add_value("user_type", data["user_type"])
                    zhihu_itemloader.add_value("answer_count",
                                               data["answer_count"])
                    zhihu_itemloader.add_value("is_following",
                                               data["is_following"])
                    zhihu_itemloader.add_value("url", data["url"])
                    zhihu_itemloader.add_value("url_token", data["url_token"])
                    zhihu_itemloader.add_value("id_", data["id"])
                    zhihu_itemloader.add_value("articles_count",
                                               data["articles_count"])
                    zhihu_itemloader.add_value("name", data["name"])
                    zhihu_itemloader.add_value("headline", data["headline"])
                    zhihu_itemloader.add_value("type_", data["type"])
                    zhihu_itemloader.add_value("is_advertiser",
                                               data["is_advertiser"])
                    zhihu_itemloader.add_value("avatar_url",
                                               data["avatar_url"])
                    zhihu_itemloader.add_value("is_org", data["is_org"])
                    zhihu_itemloader.add_value("gender", data["gender"])
                    zhihu_itemloader.add_value("follower_count",
                                               data["follower_count"])
                    zhihu_itemloader.add_value("badge", data["badge"])
                    zhihu_itemloader.add_value(
                        "crawl_time",
                        datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
                    yield zhihu_itemloader.load_item()
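
ItemLoader.add_value collects every value into a list by default, so the item loaded above carries single-element lists. The conventional fix is a loader subclass with a TakeFirst output processor; a sketch, where the subclass name is hypothetical:

    from itemloaders.processors import TakeFirst  # scrapy.loader.processors on older Scrapy
    from scrapy.loader import ItemLoader

    class ZhihuItemLoader(ItemLoader):
        # unwrap the single-element lists that add_value produces by default
        default_output_processor = TakeFirst()

Instantiating ZhihuItemLoader(item=ZhihuItem(), response=response) in place of the plain ItemLoader would then yield scalar field values.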
Example No. 15
    def parse(self, response):
        results = json.loads(response.text)
        # print(response.text)
        item = ZhihuItem()
        for field in item.fields:
            if field in results.keys():
                item[field] = results.get(field)
        yield item

        yield scrapy.Request(self.followees_url.format(results.get('url_token'),self.followees_include),
                             self.parse_followees)
        yield scrapy.Request(self.followers_url.format(results.get('url_token'), self.followers_include),
                             self.parse_followers)
Example No. 16
 def parse_user(self, response):
     result = json.loads(response.text)
     item = ZhihuItem()
     for field in item.fields:
         if field in result.keys():
             item[field] = result.get(field)
     yield item
     yield scrapy.Request(url=self.follow_url.format(
         user=result.get('url_token'),
         include=self.follow_include,
         offset=0,
         limit=20),
                          callback=self.parse_follows)
Example No. 17
    def parse(self, response):

        # debug: print(response.body.decode("utf-8", "ignore"))
        response_data = json.loads(response.body.decode("utf-8",
                                                        "ignore"))["data"]
        # print(len(response_data))
        # print('**'*20)
        count = len(response_data)
        if count >= 20:  # a full page suggests more results; queue the next page
            page_offset = int(re.findall(r"&offset=(.*?)&", response.url)[0])
            new_page_offset = page_offset + 20
            next_page_url = response.url.replace(
                "&offset=" + str(page_offset) + "&",
                "&offset=" + str(new_page_offset) + "&")
            yield scrapy.Request(url=next_page_url, callback=self.parse)

        for eve_user in response_data:
            item = ZhihuItem()
            item["name"] = eve_user["name"]
            item["is_advertiser"] = eve_user["is_advertiser"]
            item["avatar_url_template"] = eve_user["avatar_url_template"]
            item["user_type"] = eve_user["user_type"]
            item["answer_count"] = eve_user["answer_count"]
            item["type"] = eve_user["type"]
            item["url_token"] = eve_user["url_token"]
            item["user_id"] = eve_user["id"]
            item["articles_count"] = eve_user["articles_count"]
            item["url"] = eve_user["url"]
            item["gender"] = eve_user["gender"]
            item["headline"] = eve_user["headline"]
            item["avatar_url"] = eve_user["avatar_url"]
            item["is_org"] = eve_user["is_org"]
            item["follower_count"] = eve_user["follower_count"]

            # dedupe: skip users whose url_token was already recorded
            with open("userinfor.txt") as f:
                user_list = f.read()

            if eve_user["url_token"] not in user_list:
                with open("userinfor.txt", "a") as f:
                    f.write(eve_user["url_token"] + "----")  # dropped the no-op f.writable() call

            yield item

            new_url = "https://www.zhihu.com/api/v4/members/" + eve_user[
                "url_token"] + "/followers?include=data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics&offset=20&limit=20"
            yield scrapy.Request(url=new_url, callback=self.parse)
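
Re-reading the whole file for every user is slow, and the substring check can also match across token boundaries. A sketch of an in-memory alternative; seen_tokens and is_new_user are assumed names, not part of the original spider:

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.seen_tokens = set()  # url_tokens already emitted this run

    def is_new_user(self, url_token):
        # True exactly once per token; avoids the per-user file round-trip
        if url_token in self.seen_tokens:
            return False
        self.seen_tokens.add(url_token)
        return True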
Example No. 18
    def get_user_info(self, response):  # parse the user's profile details

        data = json.loads(response.text)
        #print(data)
        item = ZhihuItem()
        for Field in item.fields:  # the keys declared on the item: locations, employments, etc.
            # print(Field)
            if Field in data.keys():

                if Field == 'educations':
                    try:
                        item[Field] = ','.join([
                            edu.get('school').get('name')
                            for edu in data.get(Field)
                        ])
                    except (TypeError, AttributeError):  # narrowed from a bare except
                        item[Field] = ''
                elif Field == 'business':
                    try:
                        item[Field] = data.get(Field).get('name')
                    except AttributeError:
                        item[Field] = ''
                elif Field == 'employments':
                    try:
                        item[Field] = ','.join([
                            emp.get('job').get('name')
                            for emp in data.get(Field)
                        ])
                    except (TypeError, AttributeError):
                        item[Field] = ''
                elif Field == 'locations':
                    try:
                        item[Field] = data.get(Field)[0].get('name')
                    except (TypeError, AttributeError, IndexError):
                        item[Field] = ''
                else:
                    item[Field] = data.get(Field)  # plain scalar value
        yield item
        yield scrapy.Request(url=self.followers_url.format(
            user_name=data.get('url_token'),
            include_follow=self.include_follow,
            offset=0,
            limit=20),
                             callback=self.get_followers_parse)
        yield scrapy.Request(url=self.followees_url.format(
            user_name=data.get('url_token'),
            include_follow=self.include_follow,
            offset=0,
            limit=20),
                             callback=self.get_followees_parse)
Example No. 19
 def parse_homepage(self, response):
     divs = response.xpath(
         '//div[starts-with(@class,"feed-item folding feed-item-hook feed-item")]'
     )
     for div in divs:
         items = ZhihuItem()
         label = div.xpath(
             'div[@class ="feed-main"]/div[@class= "feed-source"]/a/text()'
         ).extract()
         print(label)
         # items['label'] = label[0] if label else ''
         # items['title'] = div.xpath('./div/div/div/h2[@class ="feed-title"]/a/text()').extract()[0]
         # item['author'] = div.xpath('./div/div/div/div/div/span/span[@class = "author-link"]/a/text()').extract()[0]
         yield items  # yielded empty while the field assignments above stay commented out
Example No. 20
    def parse_user(self, response):
        #print(response.text)
        result = json.loads(response.text)
        item = ZhihuItem()

        for field in item.fields:
            if field in result.keys():
                item[field] = result.get(field)
        yield item
        yield Request(
            self.follows_url.format(user=result.get('url_token'),
                                    include=self.follows_query,
                                    limit=20,
                                    offset=0), self.parse_follows)
Example No. 21
 def parse(self, response):
     item = ZhihuItem()
     print(response)
     item["title"] = response.xpath(
         "//meta[@itemprop='name']/@content").extract()
     item["content"] = response.xpath(
         "//span[@itemprop='text']/text()").extract()
     item["link"] = response.xpath(
         "//a[@data-za-detail-view-id='3942']/@href").extract()
     #item["like"] = response.xpath("//button[@class='Button VoteButton VoteButton--up']/text()").extract()
     #item["dislike"] = response.xpath("//button[@class='Button VoteButton VoteButton--down']/text()/").extract()
     #item["comment"] = response.xpath("//button[@]class='Button ContentItem-action Button--plain Button--withIcon Button--withLabe']/text()").extract()
     print(item['title'], item['content'])
     yield item  # re-enabled so the parsed fields are actually emitted; stray ''' removed
Example No. 22
 def parse(self, response):
     self.xsrf = response.xpath(
         './/input[@name="_xsrf"]/@value').extract()[0]
     self.i = 0
     pages = response.xpath('.//div[@class="feed-content"]')
     for page in pages:
         item = ZhihuItem()  # fresh item per entry, yielded instead of silently discarded
         item['title'] = page.xpath(
             './/h2[@class="feed-title"]/a/text()').extract()
         item['content'] = page.xpath(
             './/div[@class="zh-summary summary clearfix"]/text()').extract()
         yield item
     for j in range(50):
         yield self.next10()
         time.sleep(0.5)  # note: blocks the reactor; see the settings sketch below
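
Sleeping inside a callback stalls Scrapy's whole event loop. Throttling normally lives in the project settings instead; a minimal sketch with illustrative values:

    # settings.py sketch: let Scrapy pace requests instead of sleeping in callbacks
    DOWNLOAD_DELAY = 0.5          # seconds between requests to the same domain
    AUTOTHROTTLE_ENABLED = True   # adapt the delay to observed server latency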
Example No. 23
 def parse(self, response):
     sel = scrapy.selector.Selector(response)
     # names = sel.xpath('//div/div/div/div/ul/li/div/div/p[@class="p1"]/a/text()').extract()
     # zans = sel.xpath('//div/div/div/div/ul/li/div/div/p/span/text()').extract()
     # titles = sel.xpath('//div/div/div/div/ul/li/p/a/text()').extract()
     sites = sel.xpath('//div/div/div/div/ul[@class="news-list"]/li')
     items = []
     for site in sites:
         item = ZhihuItem()
         item['name'] = site.xpath(
             'div/div/p[@class="p1"]/a/text()').extract()
         item['zan'] = site.xpath('div/div/p/span/text()').extract()
         item['title'] = site.xpath('p/a/text()').extract()
         items.append(item)
     return items
Example No. 24
 def parse_user(self, response):  # renamed: Scrapy passes a Response here, not a Request
     item = ZhihuItem()
     results = json.loads(response.text)
     for field in item.fields:
         if field in results.keys():
             item[field] = results.get(field)
     yield item
     yield scrapy.Request(
         self.followings_url %
         (results.get('url_token'), self.followings_include),
         callback=self.parse_followings)
     yield scrapy.Request(
         self.followers_url %
         (results.get('url_token'), self.followers_include),
         callback=self.parse_followers)
Example No. 25
 def parse_user(self, response):
     result = json.loads(response.text)
     item = ZhihuItem()
     actor = result.get('data')[0].get('actor')  # the user payload sits under data[0].actor
     for field in item.fields:
         if field in actor:
             item[field] = actor.get(field)
     yield item
     #yield Request(self.user_url.format(user=result.get('url_token'),include=self.user_query,offset=0,limit=20),meta={'proxy':'47.112.222.108:8000'},callback=self.parse)
     yield Request(self.user_url.format(user=actor.get('url_token'),  # url_token lives on actor, not the top-level JSON
                                        include=self.user_query,
                                        offset=0,
                                        limit=20),
                   callback=self.parse)
Example No. 26
    def parse_user(self, response):
        result = json.loads(response.text)
        item = ZhihuItem()

        for field in item.fields:

            if field in result.keys():
                item[field] = result.get(field)

        yield item

        # recurse: crawl this user's follow list
        yield scrapy.Request(
            self.follows_url.format(user_token=result.get('url_token')),
            callback=self.parse_follows,
            meta={'cookiejar': response.meta['cookiejar']})
Example No. 27
    def parse_user(self, response):
        # print(response.text)
        result = json.loads(response.text)
        item = ZhihuItem()

        for field in item.fields:
            if field in result.keys():
                item[field] = result.get(field)
        yield item
        # fetch the followees' own followee lists
        yield scrapy.Request(url=self.followee_url.format(
            user=result.get("url_token"),
            include=self.followee_query,
            offset=0,
            limit=20),
                             callback=self.parse_followee)
Example No. 28
    def parse(self, response):
        # print response.meta["cookies"]

        for each in Selector(response).xpath(
                r'//div[@class="Card AnswerCard"] | //div[@class="Card MoreAnswers"]'
        ):
            item = ZhihuItem()
            item["authorName"] = \
            Selector(each).xpath(r'.//span[@class="UserLink AuthorInfo-name"]//a/text()').extract()[0]

            item["authorUrl"] = \
            Selector(each).xpath(r'.//div[@class="ContentItem-meta"]//meta[@itemprop="url"]/@content').extract()[0]
            item["url"] = response.url

            yield item
            print('11111')  # debug marker
Example No. 29
    def parse_item(self, response):
        # print response.body
        selector = Selector(response)

        # print 'Start extract data from response ..............'
        items = []
        for elem in selector.xpath(
                '//div[@class="feed-content"]/h2[@class="feed-title"]'):
            item = ZhihuItem()
            item['title'] = elem.xpath('a/text()').extract()
            item['link'] = elem.xpath('a/@href').extract()
            items.append(item)

            print(item['title'])  # extract() returns a list of str; .decode() would raise
            print(item['link'])
        # print 'Finish extract data........................'
        return items
Example No. 30
 def parse_user(self, response):
     print(response.text)
     result = json.loads(response.text)
     #print(result)
     item = ZhihuItem()
     print('result:')
     for field in item.fields:
         if field in result.keys():
             item[field] = result.get(field)
     yield item
     yield Request(self.follow_url.format(user=result.get('url_token'),
                                          include=self.follow_query,
                                          offset=0),
                   callback=self.parse_follow)
     yield Request(self.follow_url.format(user=result.get('url_token'),
                                          include=self.follow_query,
                                          offset=0),
                   callback=self.parse_follower)  # note: reuses follow_url; a distinct follower endpoint may be intended
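
Examples 1, 14, and 17 each hand-roll offset paging; Zhihu's v4 API also reports paging.is_end and paging.next, which makes the loop generic. A sketch of that pattern; next_page_request is a hypothetical helper, not taken from any example above:

    # assumes json and scrapy are imported, as in the examples above
    def next_page_request(self, response, callback):
        # follow the API's own paging cursor instead of computing offsets
        paging = json.loads(response.text).get('paging', {})
        if not paging.get('is_end') and paging.get('next'):
            return scrapy.Request(url=paging['next'], callback=callback)
        return None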