Exemplo n.º 1 — parse_user: deserializes a JSON user API response into a ZhihuUserItem (scrapy spider).
    def parse_user(self, response):
        """
        Parse a user's basic profile information.

        The response body is JSON, so it is deserialized and every field
        declared on ZhihuUserItem is copied over.  After yielding the item,
        the followee-list and fan-list requests are scheduled.

        :param response: scrapy Response whose body is a JSON user object
        :return: generator yielding the populated item and follow-up Requests
        """
        result = json.loads(response.text)
        item = ZhihuUserItem()  # instantiate the Item

        # Copy every matching JSON key into the Item.
        for field in item.fields:
            if field in result:
                item[field] = result.get(field, '')

        # BUG FIX: the two assignments below were previously inside the loop
        # above and re-ran once per Item field; they only need to run once.
        # The JSON uses 'id' while the Item defines 'userid', so map manually.
        if 'id' in result:
            item['userid'] = result.get('id', '')
        item['updatetime'] = datetime.datetime.now().isoformat(' ')

        yield item
        # print(item)

        # Schedule the followee-list request.
        yield scrapy.Request(url=self.followees_url.format(
            username=result.get('url_token'),
            followees_query=self.followees_query,
            offset=0,
            limit=20),
                             callback=self.parse_followees)

        # Schedule the fans (followers) list request.
        yield scrapy.Request(url=self.fans_url.format(
            username=result.get('url_token'),
            fans_query=self.fans_query,
            offset=0,
            limit=20),
                             callback=self.parse_fans)
Exemplo n.º 2 — parse_user: regex/XPath profile-page parser using an ItemLoader.
    def parse_user(self, response):
        """
        Parse a Zhihu profile page into an ItemLoader-backed ZhihuUserItem,
        then schedule the user's answer-list API request.

        Counts (votes / thanks / collections) are scraped with regexes over
        the raw HTML; most other fields come from XPath expressions.
        """
        item_loader = ItemLoader(item=ZhihuUserItem(), response=response)

        # Regexes pulling vote / thanks / collection counts out of the raw
        # HTML (numbers may contain thousands separators, hence [0-9,]).
        vote_pat = '<div class="IconGraf".*?</div>.*?获得.*?>([0-9,]+)<.*?次赞同.*?</div>'
        thanks_pat = '次赞同.*?<div.*?获得(.*?)次感谢'
        collection_pat = '次感谢.*?>([0-9,]+).*?次收藏'
        get_vote_num = re.compile(vote_pat, re.DOTALL).findall(response.text)
        get_thanks = re.compile(thanks_pat, re.DOTALL).findall(response.text)
        get_collection = re.compile(collection_pat, re.DOTALL).findall(response.text)
        answer_num = response.xpath('//li[@aria-controls="Profile-answers"]/a/span/text()').extract_first()

        # Profile-header entries: the visible text plus the SVG path "d"
        # attribute, which identifies the entry type (sex / job / education).
        inf_pat = '<div class="ProfileHeader-iconWrapper".*?</div>.*?>(.*?)<.*?(</div>|<div>)'
        code_pat = '<div class="ProfileHeader-iconWrapper".*?<svg.*?<path d="(.*?)".*?>'
        inf = re.compile(inf_pat, re.DOTALL).findall(response.text)
        code = re.compile(code_pat, re.DOTALL).findall(response.text)

        # BUG FIX: the original condition was `if inf and code is not None:`.
        # `code` is the list returned by findall() and is never None, so that
        # test was always true; we actually need BOTH lists to be non-empty.
        if inf and code:
            inf_code = list(zip([i[0] for i in inf], code))
            for text, icon in inf_code:
                if icon == MAN_CODE:
                    item_loader.add_value('sex', "男")
                elif icon == WOMAN_CODE:
                    item_loader.add_value('sex', "女")
                elif icon == JOB_CODE:
                    item_loader.add_value('career', text)
                elif icon == EDUCATION:
                    item_loader.add_value('educational_experience', text)

        item_loader.add_value('user_url', response.url)
        item_loader.add_value('url_id', get_md5(response.url))
        item_loader.add_xpath('name', '//h1[@class="ProfileHeader-title"]/span[1]/text()')
        item_loader.add_xpath('introduce_yourself', '//h1[@class="ProfileHeader-title"]/span[2]/text()')
        item_loader.add_value('get_vote_num', get_vote_num)
        item_loader.add_value('get_thanks', get_thanks)
        item_loader.add_value('get_collection', get_collection)
        item_loader.add_xpath('followers', '//div[@class="Card FollowshipCard"]/div/a[2]/div/strong/text()')
        item_loader.add_xpath('following', '//div[@class="Card FollowshipCard"]/div/a[1]/div/strong/text()')
        item_loader.add_value('answer_num', answer_num)
        item_loader.add_xpath('questions_num', '//li[@aria-controls="Profile-asks"]/a/span/text()')
        item_loader.add_xpath('articles_num', '//li[@aria-controls="Profile-posts"]/a/span/text()')
        item_loader.add_xpath('columns_num', '//li[@aria-controls="Profile-columns"]/a/span/text()')
        item_loader.add_xpath('ideal_num', '//li[@aria-controls="Profile-pins"]/a/span/text()')

        author_item = item_loader.load_item()
        yield author_item

        # Recurse into the user's answers, but only when the user actually
        # has answers.  NOTE(review): `author_id` defaults to "" here, so the
        # `is not None` test is only meaningful if a caller explicitly put
        # None in response.meta — confirm against the requesting callback.
        author_id = response.meta.get("author_id", "")
        if author_id is not None and answer_num is not None and answer_num != '0':
            user_answer_url = self.user_answer_api.format(author_id, 0, 20)
            yield Request(url=user_answer_url, callback=self.recursion_question)
Exemplo n.º 3 — parse_user: minimal JSON API parser with follows/followers pagination seeds.
    def parse_user(self, response):
        """
        Deserialize the user JSON payload into a ZhihuUserItem, then queue
        the first page of the user's follows and followers listings.
        """
        data = json.loads(response.text)

        item = ZhihuUserItem()
        # Copy over every declared Item field present in the payload.
        for key in item.fields:
            if key in data:
                item[key] = data.get(key)
        yield item

        token = data.get('url_token')

        # First page of the people this user follows.
        yield Request(
            self.follows_url.format(user=token,
                                    include=self.follows_query,
                                    limit=20,
                                    offset=0), self.parse_follows)

        # First page of the people following this user.
        yield Request(
            self.followers_url.format(user=token,
                                      include=self.followers_query,
                                      limit=20,
                                      offset=0), self.parse_followers)
Exemplo n.º 4 — parse_user: Python 2 HTML profile scraper with followee/follower pagination.
    def parse_user(self, response):
        """
        Parse a Zhihu profile page (old HTML layout) into a ZhihuUserItem,
        then schedule paged AJAX requests for the user's followees and
        followers.

        NOTE: Python 2 code (print statement, xrange, integer division).
        """
        selector = Selector(response)
        user = ZhihuUserItem()
        # Profile URLs end in .../people/<username>/..., so the username is
        # the second-to-last path segment; it doubles as the document _id.
        user['_id'] = user['username'] = response.url.split('/')[-2]
        user['url'] = response.url
        user['nickname'] = ''.join(
            selector.xpath(
                "//div[@class='title-section ellipsis']/a[@class='name']/text()"
            ).extract())
        user['location'] = ''.join(
            selector.xpath("//span[@class='location item']/@title").extract())
        user['industry'] = ''.join(
            selector.xpath("//span[@class='business item']/@title").extract())
        # Sex is encoded in the gender icon's CSS class; strip the common
        # prefix so only the gender token remains.
        user['sex'] = ''.join(
            selector.xpath(
                '//div[@class="item editable-group"]/span/span[@class="item"]/i/@class'
            ).extract()).replace("zg-icon gender ", "")
        user['description'] = ''.join(
            selector.xpath(
                "//span[@class='description unfold-item']/span/text()").
            extract()).strip().replace("\n", '')
        user['view_num'] = ''.join(
            selector.xpath(
                "//span[@class='zg-gray-normal']/strong/text()").extract())
        user['update_time'] = str(datetime.now())

        # Employment entries: company and title live in data-* attributes.
        user['jobs'] = []
        job_nodes = selector.xpath(
            '//div[@class="zm-profile-module zg-clear"][1]/div/ul[@class="zm-profile-details-items"]/li'
        )
        for node in job_nodes:
            company = ''.join(node.xpath('@data-title').extract())
            title = ''.join(node.xpath('@data-sub-title').extract())
            user['jobs'].append({'company': company, 'title': title})

        # Education entries use the same data-attribute layout.
        user['educations'] = []
        edu_nodes = selector.xpath(
            '//div[@class="zm-profile-module zg-clear"][3]/div/ul[@class="zm-profile-details-items"]/li'
        )
        for node in edu_nodes:
            school = ''.join(node.xpath('@data-title').extract())
            major = ''.join(node.xpath('@data-sub-title').extract())
            user['educations'].append({'school': school, 'major': major})

        # Linked weibo accounts, distinguished by their URL prefix.
        user['sinaweibo'] = ''
        user['tencentweibo'] = ''
        for node in selector.xpath(
                "//a[@class='zm-profile-header-user-weibo']/@href").extract():
            if node.startswith('http://weibo.com'):
                user['sinaweibo'] = node
            elif node.startswith('http://t.qq.com'):
                user['tencentweibo'] = node

        # Followee / follower counts.  NOTE(review): raises IndexError if the
        # page layout changes and fewer than two <strong> values are found.
        statistics = selector.xpath(
            "//a[@class='item']/strong/text()").extract()
        followee_num = user['followee_num'] = statistics[0]
        follower_num = user['follower_num'] = statistics[1]

        # Agree / thank / favorite / share counters, stored only when all
        # four are present.
        statistics = selector.xpath(
            "//div[@class='zm-profile-module-desc']/span/strong/text()"
        ).extract()
        if len(statistics) == 4:
            user['agree_num'] = statistics[0]
            user['thank_num'] = statistics[1]
            user['fav_num'] = statistics[2]
            user['share_num'] = statistics[3]

        # Navbar counters; index 0 is skipped (presumably the profile/home
        # tab carries no count — confirm against a live page).
        statistics = selector.xpath(
            "//div[@class='profile-navbar clearfix']/a/span/text()").extract()
        if len(statistics) == 6:
            user['ask_num'] = statistics[1]
            user['answer_num'] = statistics[2]
            user['post_num'] = statistics[3]
            user['collection_num'] = statistics[4]
            user['log_num'] = statistics[5]

        # CSRF token and profile hash id, both required by the
        # ProfileFollowees/FollowersListV2 AJAX endpoints below.
        _xsrf = ''.join(
            selector.xpath('//input[@name="_xsrf"]/@value').extract())
        hash_id = ''.join(
            selector.xpath(
                '//div[@class="zm-profile-header-op-btns clearfix"]/button/@data-id'
            ).extract())

        yield user
        self.user_names.append(user['username'])
        print 'NEW:%s' % user['username']

        # Page through the followee list, 20 entries per request (integer
        # division plus a remainder page under Python 2 semantics).
        num = int(followee_num) if followee_num else 0
        page_num = num / 20
        page_num += 1 if num % 20 else 0
        for i in xrange(page_num):
            params = json.dumps({
                "hash_id": hash_id,
                "order_by": "created",
                "offset": i * 20
            })
            payload = {"method": "next", "params": params, "_xsrf": _xsrf}
            yield Request("http://www.zhihu.com/node/ProfileFolloweesListV2?" +
                          urlencode(payload),
                          callback=self.parse_follow_url)

        # Page through the follower list the same way.
        num = int(follower_num) if follower_num else 0
        page_num = num / 20
        page_num += 1 if num % 20 else 0
        for i in xrange(page_num):
            params = json.dumps({
                "hash_id": hash_id,
                "order_by": "created",
                "offset": i * 20
            })
            payload = {"method": "next", "params": params, "_xsrf": _xsrf}
            yield Request("http://www.zhihu.com/node/ProfileFollowersListV2?" +
                          urlencode(payload),
                          callback=self.parse_follow_url)
Exemplo n.º 5 — parse_user: variant of the Python 2 HTML scraper that also paginates asks/answers.
    def parse_user(self, response):
        """
        Parse a Zhihu profile page into a ZhihuUserItem and schedule paged
        requests for the user's questions, answers, followees and followers.

        NOTE: Python 2 code (print statement, xrange, integer division).
        """
        print 'parsing user: %s' % response.url
        selector = Selector(response)
        user = ZhihuUserItem()
        # The username segment position depends on whether we are on the
        # ".../about" sub-page or the bare profile URL.
        if(response.url.endswith('about')):
            user['_id'] = user['username'] = response.url.split('/')[-2]
        else:
            user['_id'] = user['username'] = response.url.split('/')[-1]
        user['url'] = response.url
        user['nickname'] = ''.join(
            selector.xpath("//div[@class='title-section ellipsis']/a[@class='name']/text()").extract())
        user['location'] = ''.join(selector.xpath("//span[@class='location item']/@title").extract())
        user['industry'] = ''.join(selector.xpath("//span[@class='business item']/@title").extract())
        # Sex is encoded in the gender icon's CSS class; strip the prefix.
        user['sex'] = ''.join(
            selector.xpath('//div[@class="item editable-group"]/span/span[@class="item"]/i/@class').extract()).replace(
            "zg-icon gender ", "")
        user['description'] = ''.join(
            selector.xpath("//span[@class='description unfold-item']/span/text()").extract()).strip().replace("\n", '')
        user['view_num'] = ''.join(selector.xpath("//span[@class='zg-gray-normal']/strong/text()").extract())
        user['update_time'] = str(datetime.now())

        # Employment entries: company/title live in data-* attributes.
        user['jobs'] = []
        job_nodes = selector.xpath(
            '//div[@class="zm-profile-module zg-clear"][1]/div/ul[@class="zm-profile-details-items"]/li')
        for node in job_nodes:
            company = ''.join(node.xpath('@data-title').extract())
            title = ''.join(node.xpath('@data-sub-title').extract())
            user['jobs'].append({'company': company, 'title': title})

        # Education entries use the same data-attribute layout.
        user['educations'] = []
        edu_nodes = selector.xpath(
            '//div[@class="zm-profile-module zg-clear"][3]/div/ul[@class="zm-profile-details-items"]/li')
        for node in edu_nodes:
            school = ''.join(node.xpath('@data-title').extract())
            major = ''.join(node.xpath('@data-sub-title').extract())
            user['educations'].append({'school': school, 'major': major})

        # Linked weibo accounts, distinguished by their URL prefix.
        user['sinaweibo'] = ''
        user['tencentweibo'] = ''
        for node in selector.xpath("//a[@class='zm-profile-header-user-weibo']/@href").extract():
            if node.startswith('http://weibo.com'):
                user['sinaweibo'] = node
            elif node.startswith('http://t.qq.com'):
                user['tencentweibo'] = node

        # Followee / follower counts.  NOTE(review): raises IndexError if
        # fewer than two <strong> values are found.
        statistics = selector.xpath("//a[@class='item']/strong/text()").extract()
        followee_num = user['followee_num'] = statistics[0]
        follower_num = user['follower_num'] = statistics[1]

        # Agree / thank / favorite / share counters, only when all present.
        statistics = selector.xpath("//div[@class='zm-profile-module-desc']/span/strong/text()").extract()
        if len(statistics) == 4:
            user['agree_num'] = statistics[0]
            user['thank_num'] = statistics[1]
            user['fav_num'] = statistics[2]
            user['share_num'] = statistics[3]

        # Navbar counters; index 0 is skipped (presumably the profile/home
        # tab — confirm against a live page).
        statistics = selector.xpath("//div[@class='profile-navbar clearfix']/a/span/text()").extract()
        if len(statistics) == 6:
            user['ask_num'] = statistics[1]
            user['answer_num'] = statistics[2]
            user['post_num'] = statistics[3]
            user['collection_num'] = statistics[4]
            user['log_num'] = statistics[5]

        # CSRF token and profile hash id for the follow-list AJAX endpoints.
        _xsrf = ''.join(selector.xpath('//input[@name="_xsrf"]/@value').extract())
        hash_id = ''.join(
            selector.xpath('//div[@class="zm-profile-header-op-btns clearfix"]/button/@data-id').extract())



        # questions: one request per 20-question page.
        num = int(user['ask_num']) if "ask_num" in user.keys() else 0
        page_num = num / 20
        page_num += 1 if num % 20 else 0
        for i in xrange(page_num):
            url = host + "/people/" + user["username"] + '/asks?page=%d' % (i + 1)
            yield Request(url, callback=self.parse_ask)

        # answers: one request per 20-answer page.
        num = int(user['answer_num']) if "answer_num" in user.keys()  else 0
        page_num = num / 20
        page_num += 1 if num % 20 else 0
        for i in xrange(page_num):
            yield Request(host + "/people/" + user["username"] + '/answers?page=%d' % (i + 1),
                          callback=self.parse_ans)

        self.user_names.append(user['username'])
        print 'User parsed: %s' % user['username']
        yield user

        # Page through the followee list, 20 entries per request.
        num = int(followee_num) if followee_num else 0
        page_num = num / 20
        page_num += 1 if num % 20 else 0
        for i in xrange(page_num):
            params = json.dumps({"hash_id": hash_id, "order_by": "created", "offset": i * 20})
            payload = {"method": "next", "params": params, "_xsrf": _xsrf}
            yield Request("http://www.zhihu.com/node/ProfileFolloweesListV2?" + urlencode(payload),
                          callback=self.parse_follow_url)

        # Page through the follower list the same way.
        num = int(follower_num) if follower_num else 0
        page_num = num / 20
        page_num += 1 if num % 20 else 0
        for i in xrange(page_num):
            params = json.dumps({"hash_id": hash_id, "order_by": "created", "offset": i * 20})
            payload = {"method": "next", "params": params, "_xsrf": _xsrf}
            yield Request("http://www.zhihu.com/node/ProfileFollowersListV2?" + urlencode(payload),
                          callback=self.parse_follow_url)
Exemplo n.º 6 — parse: single dispatch callback that handles both profile "about" pages and followee list pages.
    def parse(self, response):
        """
        Single entry-point callback that dispatches on the response URL:
        the site root seeds a first profile request; ".../about" pages are
        parsed into a ZhihuUserItem; followee list pages (HTML or the
        ProfileFolloweesListV2 AJAX endpoint) are mined for new profile
        links.  Failed pages are dumped under error_pages/ for debugging.

        NOTE: Python 2 code (print statement, xrange, `except Exception, e`).
        """
        # Small random delay to throttle request rate.
        time.sleep(random.random())

        if response.url == host:
            yield Request("http://www.zhihu.com/people/raymond-wang/about", headers = self.headers, cookies = self.cookies)
        else:
            # Last URL path segment identifies the page type.
            typeinfo = response.url.split('/')[-1]
            selector = Selector(response)

            if typeinfo.startswith('about'):
                try:
                    user = ZhihuUserItem()
                    # Username is the second-to-last path segment of an
                    # ".../<username>/about" URL; it doubles as _id.
                    user['_id']=user['username']=response.url.split('/')[-2]
                    user['url']= response.url
                    user['nickname'] = ''.join(selector.xpath("//div[@class='title-section ellipsis']/a[@class='name']/text()").extract())
                    user['location'] = ''.join(selector.xpath("//span[@class='location item']/@title").extract())
                    user['industry'] = ''.join(selector.xpath("//span[@class='business item']/@title").extract())
                    # Sex is encoded in the gender icon's CSS class.
                    user['sex'] = ''.join(selector.xpath('//div[@class="item editable-group"]/span/span[@class="item"]/i/@class').extract()).replace("zg-icon gender ","")
                    user['description'] = ''.join(selector.xpath("//span[@class='description unfold-item']/span/text()").extract()).strip().replace("\n",'')
                    user['view_num'] = ''.join(selector.xpath("//span[@class='zg-gray-normal']/strong/text()").extract())
                    user['update_time'] = str(datetime.now())

                    # Employment entries from data-* attributes.
                    user['jobs'] = []
                    job_nodes = selector.xpath('//div[@class="zm-profile-module zg-clear"][1]/div/ul[@class="zm-profile-details-items"]/li')
                    for node in job_nodes:
                        company = ''.join(node.xpath('@data-title').extract())
                        title = ''.join(node.xpath('@data-sub-title').extract())
                        user['jobs'].append({'company': company, 'title':title})

                    # Education entries, same layout.
                    user['educations'] = []
                    edu_nodes = selector.xpath('//div[@class="zm-profile-module zg-clear"][3]/div/ul[@class="zm-profile-details-items"]/li')
                    for node in edu_nodes:
                        school = ''.join(node.xpath('@data-title').extract())
                        major = ''.join(node.xpath('@data-sub-title').extract())
                        user['educations'].append({'school':school, 'major':major})

                    # Linked weibo accounts by URL prefix.  NOTE(review):
                    # unlike the sibling examples, no defaults are set first,
                    # so these fields may be absent when no link exists.
                    for node in selector.xpath("//a[@class='zm-profile-header-user-weibo']/@href").extract():
                        if node.startswith('http://weibo.com'):
                            user['sinaweibo'] = node
                        elif node.startswith('http://t.qq.com'):
                            user['tencentweibo'] = node

                    # Followee / follower counts (IndexError if missing,
                    # caught by the enclosing try and dumped to error_pages).
                    statistics = selector.xpath("//a[@class='item']/strong/text()").extract()
                    followee_num =user['followee_num'] = statistics[0]
                    follower_num = user['follower_num']= statistics[1]

                    # Agree / thank / favorite / share counters.
                    statistics = selector.xpath("//div[@class='zm-profile-module-desc']/span/strong/text()").extract()
                    if len(statistics) ==4:
                        user['agree_num'] = statistics[0]
                        user['thank_num'] = statistics[1]
                        user['fav_num'] = statistics[2]
                        user['share_num'] = statistics[3]

                    # Navbar counters; index 0 is skipped.
                    statistics = selector.xpath("//div[@class='profile-navbar clearfix']/a/span/text()").extract()
                    if len(statistics) ==6:
                        user['ask_num'] = statistics[1]
                        user['answer_num'] = statistics[2]
                        user['post_num'] = statistics[3]
                        user['collection_num'] = statistics[4]
                        user['log_num'] = statistics[5]

                    # CSRF token and profile hash id for the AJAX endpoints.
                    _xsrf=''.join(selector.xpath('//input[@name="_xsrf"]/@value').extract())
                    hash_id=''.join(selector.xpath('//div[@class="zm-profile-header-op-btns clearfix"]/button/@data-id').extract())

                    print 'NEW:%s' % user['username']

                    yield user
                    self.user_names.append(user['username'])
                    print 'NEW:%s' % user['username']

                    base_url = '/'.join(response.url.split('/')[:-1])
                    # NOTE(review): this mutates the shared self.headers dict,
                    # not a copy — the Referer leaks across requests.
                    headers = self.headers
                    headers['Referer'] = response.url

                    # followees: page through 20 entries per request.
                    num = int(followee_num) if followee_num else 0
                    page_num = num/20
                    page_num += 1 if num%20 else 0

                    for i in xrange(page_num):
                        params = json.dumps({"hash_id":hash_id,"order_by":"created","offset":i*20})
                        payload = {"method":"next", "params": params, "_xsrf":_xsrf,"username":user['username']}
                        yield Request("http://www.zhihu.com/node/ProfileFolloweesListV2?"+urlencode(payload), headers = headers, cookies = self.cookies)

                    # followers: same pagination scheme.
                    num = int(follower_num) if follower_num else 0
                    page_num = num/20
                    page_num += 1 if num%20 else 0

                    for i in xrange(page_num):
                        params = json.dumps({"hash_id":hash_id,"order_by":"created","offset":i*20})
                        payload = {"method":"next", "params": params, "_xsrf":_xsrf,"username":user['username']}
                        yield Request("http://www.zhihu.com/node/ProfileFollowersListV2?"+urlencode(payload), headers = headers, cookies = self.cookies)

                    '''
                    # questions
                    num = int(user['ask_num']) if user['ask_num'] else 0
                    page_num = num/20
                    page_num += 1 if num%20 else 0
                    for i in xrange(page_num):
                        if i > 0:
                            headers['Referer'] = base_url + '/asks?page=%d' % (i-1)
                        else:
                            headers['Referer'] = base_url + '/asks'
                        yield Request(base_url + '/asks?page=%d' % (i+1), headers = headers, cookies = self.cookies)

                    # answers
                    num = int(user['answer_num']) if user['answer_num'] else 0
                    page_num = num/20
                    page_num += 1 if num%20 else 0
                    for i in xrange(page_num):
                        if i > 0:
                            headers['Referer'] = base_url + '/answers?page=%d' % (i-1)
                        else:
                            headers['Referer'] = base_url + '/answers'

                        yield Request(base_url + '/answers?page=%d' % (i+1), headers = headers, cookies = self.cookies)
                    '''
                except Exception, e:
                    # Save the failing page body for offline inspection.
                    open('error_pages/about_' + response.url.split('/')[-2]+'.html', 'w').write(response.body)
                    print '='*10 + str(e)

            elif typeinfo.startswith('followees') or typeinfo.startswith('ProfileFolloweesListV2'):
                followees = []
                try:
                    # Profile links inside the followee list entries.
                    links = selector.xpath('//div[@class="zm-list-content-medium"]/h2/a/@href').extract()

                    for link in links:
                        username_tmp = link.split('/')[-1]
                        followees.append(username_tmp)
                        # Skip profiles already visited in this run.
                        if username_tmp in self.user_names:
                            print 'GET:' + '%s' % username_tmp
                            continue

                        headers = self.headers;
                        headers['Referer'] = response.url
                        yield Request(link+'/about', headers = headers, cookies=self.cookies)

                    # Owner username carried in the query string of the
                    # AJAX request that produced this page.
                    username=urlparse.parse_qs(urlparse.urlparse(response.url).query,True)['username'][0]
                    #yield ZhihuFolloweesItem(_id=username,username = username,followees = followees)
                except Exception, e:
                    # Save the failing page body for offline inspection.
                    open('error_pages/followees_' + response.url.split('/')[-2]+'.html', 'w').write(response.body)
                    print '='*10 + str(e)