def answer_parse(self, response):
    item = ZhihuItem()
    # item = response.meta.get("item")
    data_list = json.loads(response.text)
    questionID = data_list['data'][0]['question']['id']
    for data in data_list['data']:
        item['question_url'] = 'https://www.zhihu.com/question/' + str(questionID)
        item['question_title'] = data['question']['title']  # question title
        item['answer_url'] = data['url']
        item['answer_content'] = data['content']
        item['answer_voteup_count'] = data['voteup_count']
        item['author_url'] = ('https://www.zhihu.com/people/'
                              + data['author']['url_token'] + '/activities')  # answerer profile link
        item['author_name'] = data['author']['name']  # answerer name
        item['author_gender'] = data['author']['gender']  # answerer gender
        yield item
    # Pagination: keep requesting until the API reports is_end.
    if not data_list['paging']['is_end']:
        self.answer_offset_dict[questionID] += 5
        print("mydict:", self.answer_offset_dict)
        answer_url = self.answer_url.format(questionID, self.answer_offset_dict[questionID])
        yield scrapy.Request(url=answer_url,
                             callback=self.answer_parse,
                             headers=self.headers,
                             meta={'cookiejar': response.meta['cookiejar']})
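# answer_parse assumes the spider carries an answers-API URL template and a
# per-question offset counter. A minimal sketch of that scaffolding, under
# assumptions: the class name is hypothetical and the `include` fields are
# illustrative; only the two "{}" slots (question id, offset) and the
# limit=5 paging step are implied by the callback above.

from collections import defaultdict

import scrapy


class ZhihuAnswerSpiderScaffold(scrapy.Spider):  # hypothetical name
    name = 'zhihu_answers'
    # Slot 1: question id, slot 2: offset; limit=5 matches the += 5 step.
    answer_url = ('https://www.zhihu.com/api/v4/questions/{}/answers'
                  '?include=content,voteup_count&offset={}&limit=5')

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # defaultdict(int) lets `answer_offset_dict[qid] += 5` start from 0.
        self.answer_offset_dict = defaultdict(int)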
def parse_find(self, response):
    try:
        bs_obj = BeautifulSoup(response.body, 'html.parser')
        div_objs = bs_obj.find_all('div', {'class': 'explore-feed feed-item'})
        item = ZhihuItem()
        item['data'] = list()
        item['type'] = 1
        data_offset = 0
        for div_obj in div_objs:
            data = dict()
            attr_obj = div_obj.find('div', {'class': 'zm-item-answer '})
            data['id'] = attr_obj.attrs['data-atoken']
            data['content'] = str(div_obj)
            item['data'].append(data)
            offset = int(div_obj.attrs['data-offset'])  # attribute values are strings
            if offset > data_offset:
                data_offset = offset
        print('Discover feed:', item['data'])
        yield item
        if data_offset > self.find_page:
            self.find_page = data_offset
            sleep(CRAWL_DELAY)
            yield self.__get_finds()
    except AttributeError as e:
        print('Failed to fetch the Discover feed:', e)
def parse_item(self, response):
    url = response.url
    print(url)
    nameinfo = response.xpath(
        '//div[@class="ProfileHeader-contentHead"]/h1/span/text()').extract()
    priseinfo = response.xpath('//div[@class="IconGraf"]/text()').extract()
    followinfo = response.xpath('//div[@class="NumberBoard-value"]/text()').extract()
    shoucanginfo = response.xpath(
        '//div[@class="Profile-sideColumnItemValue"]/text()').extract()
    # edu = response.xpath('//svg[@class="Icon--education"]/parent::*/parent::div/text()').extract()
    sextype = 0 if response.xpath('//svg[contains(@class, "Icon--female")]') else 1
    name = ''
    desc = ''
    nprise = 0      # upvotes received
    nganxie = 0     # thanks received
    nshoucang = 0   # favorites received
    nfollowing = 0
    nfollower = 0
    if len(nameinfo) == 2:
        name = nameinfo[0]
        desc = nameinfo[1]
        print(name)
    if len(priseinfo) == 1:
        strprise = re.findall(r'\d+', priseinfo[0])
        if strprise:  # re.findall returns a (possibly empty) list, never None
            nprise = int(strprise[0])
    if len(shoucanginfo) == 1:
        nshou = re.findall(r'\d+', shoucanginfo[0])
        if len(nshou) >= 2:  # guard against an IndexError on nshou[1]
            nganxie = int(nshou[0])
            nshoucang = int(nshou[1])
    if len(followinfo) == 2:
        nfollowing = int(followinfo[0])
        nfollower = int(followinfo[1])
    if len(nameinfo) == 2:
        item = ZhihuItem()
        item['url'] = url
        item['name'] = name
        item['desc'] = desc
        item['sextype'] = sextype
        item['prise'] = nprise
        item['ganxie'] = nganxie
        item['shoucang'] = nshoucang
        item['following'] = nfollowing
        item['follower'] = nfollower
        yield item
def parse(self, response):
    print(response.body)
    item = ZhihuItem()
    for sel in response.xpath(
            '//*[@id="root"]/div/main/div/div/div[1]/div[2]/div/div/div/div[2]'):
        item['actor_image'] = sel.xpath('text()').extract()  # was .extact(), a typo
        print(sel.xpath('text()'))
        yield item
def parse_user(self, response):
    with open('lkf.txt', 'w') as f:
        f.write(response.text)
    result = json.loads(response.text)
    item = ZhihuItem()
    for field in item.fields:
        if field in result.keys():
            item[field] = result.get(field)
    yield item
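# The `for field in item.fields` pattern used here (and in several
# parse_user variants below) copies matching keys from the JSON response
# straight into the item, so it only works if the item's field names mirror
# the API's keys. A minimal sketch of such an item declaration (the field
# set is illustrative, not the original items.py):

import scrapy


class ZhihuItem(scrapy.Item):
    # Names mirror keys in Zhihu's v4 members API payload.
    id = scrapy.Field()
    url_token = scrapy.Field()
    name = scrapy.Field()
    gender = scrapy.Field()
    headline = scrapy.Field()
    answer_count = scrapy.Field()
    articles_count = scrapy.Field()
    follower_count = scrapy.Field()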
def parse(self, response):
    problem = Selector(response)
    item = ZhihuItem()
    item['url'] = response.url
    item['name'] = problem.xpath('//span[@class="name"]/text()').extract()
    print(item['name'])
    item['title'] = problem.xpath(
        '//h2[@class="zm-item-title zm-editable-content"]/text()').extract()
    item['description'] = problem.xpath(
        '//div[@class="zm-editable-content"]/text()').extract()
    item['answer'] = problem.xpath(
        '//div[@class=" zm-editable-content clearfix"]/text()').extract()
    return item
def user_parse(self, response):
    # Parse the detail JSON for a single user page.
    results = json.loads(response.text)
    item = ZhihuItem()
    for field in item.fields:
        if field in results.keys():
            item[field] = results.get(field)
    user_token = results.get('url_token')
    yield item
    yield Request(self.follower_url.format(follower_name=user_token,
                                           include=self.follower_query,
                                           offset=0, limit=20),
                  callback=self.follower_parse)
def parse_comment(self, response):
    item = ZhihuItem()
    jd = json.loads(response.text)
    name = []
    content = []
    for comment in jd['data']:
        name.append(comment['author']['member']['name'])
        content.append(comment['content'])
    # Caveat: keying by author name keeps only the last comment per author.
    item['comment_content'] = dict(zip(name, content))
    yield item
def parse_next(self, next_content):
    pages = next_content.xpath('.//div[@class="feed-content"]')
    item = ZhihuItem()
    for page in pages:
        # Each iteration overwrites the same item, so only the last page's
        # title/content survives the return below.
        item['title'] = page.xpath('.//h2[@class="feed-title"]/a/text()').extract()
        item['content'] = page.xpath(
            './/div[@class="zh-summary summary clearfix"]/text()').extract()
    return item
def user_parse(self, response):
    user_item = ZhihuItem()
    results = json.loads(response.text)
    for field in user_item.fields:
        if field in results.keys():
            user_item[field] = results.get(field)
    yield user_item
    # Then crawl this user's followee list and follower list.
    yield Request(url=self.follows_url.format(user=results.get('url_token'),
                                              include=self.follows_include,
                                              limit=20, offset=0),
                  callback=self.follow_parse)
    yield Request(url=self.followers_url.format(user=results.get('url_token'),
                                                include=self.followers_include,
                                                limit=20, offset=0),
                  callback=self.followers_parse)
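# user_parse assumes followee/follower URL templates on the spider class. A
# sketch of what they typically look like; the include-field list is an
# assumption, modeled on the literal followers URL in the dedup spider
# further down, while the /members/{user}/followees and /followers paths
# are Zhihu's v4 API:

class ZhihuFollowSpiderScaffold:  # hypothetical name
    follows_include = ('data[*].answer_count,articles_count,gender,'
                       'follower_count,is_followed,is_following,'
                       'badge[?(type=best_answerer)].topics')
    followers_include = follows_include

    follows_url = ('https://www.zhihu.com/api/v4/members/{user}/followees'
                   '?include={include}&offset={offset}&limit={limit}')
    followers_url = ('https://www.zhihu.com/api/v4/members/{user}/followers'
                     '?include={include}&offset={offset}&limit={limit}')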
def parse(self, response):
    titles = response.xpath(
        '//*[@id="TopicMain"]/div[3]/div/div/div'
        '//div[@class="List-item TopicFeedItem"]//h2/text()').extract()
    count = 0  # must be initialized before `count += 1`
    for title in titles:
        count += 1
        zh_item = ZhihuItem()
        zh_item['title'] = title
        print(zh_item)
        yield zh_item
    print('Got', count, 'items')
def parse(self, response):
    self.__log.info('parse')
    html_doc = response.body
    soup = BeautifulSoup(html_doc, 'lxml')
    filename = 'output/zhihu.question'
    with open(filename, 'wb') as f:
        f.write(response.body)
    item = ZhihuItem()
    item['title'] = soup.title.string
    item['question'] = soup.find_all('a', class_='question_link')
    item['corp'] = soup.find_all('span', class_='corp')
    return item
def get_question_title(self, response):
    # Escape embedded quotes so json.loads can parse the payload
    # (response.text, not response.body: str.replace fails on bytes).
    line = response.text.replace("\\\"", "\\\\\\\"")
    title = json.loads(line).get('title')
    zhihuItem = ZhihuItem()
    zhihuItem['KeyWord'] = response.meta['key_word']
    zhihuItem['Topic_name'] = response.meta['Topic_name']
    zhihuItem['Topic_id'] = response.meta['topic_id']
    zhihuItem['Question_id'] = response.meta['Question_id']
    zhihuItem['Question_content'] = title
    zhihuItem['Content'] = response.meta['content']
    yield zhihuItem
def parse(self, response):
    # To verify that the cookie login worked, dump the page:
    # with open("zhihu.com", "wb") as f:
    #     f.write(response.body)

    # Deserialize the JSON response.
    response_json = json.loads(response.text)
    if not response_json["paging"]["is_end"]:
        yield scrapy.http.Request(response_json["paging"]["next"],
                                  callback=self.parse,
                                  headers=self.headers,
                                  cookies=self.cookies)
    if response_json["data"]:
        for data in response_json["data"]:
            if data["url_token"]:
                yield scrapy.http.Request(url_to_crawl.format(data["url_token"]),
                                          callback=self.parse,
                                          headers=self.headers,
                                          cookies=self.cookies)
            zhihu_itemloader = ItemLoader(item=ZhihuItem(), response=response)
            zhihu_itemloader.add_value("is_followed", data["is_followed"])
            zhihu_itemloader.add_value("avatar_url_template", data["avatar_url_template"])
            zhihu_itemloader.add_value("user_type", data["user_type"])
            zhihu_itemloader.add_value("answer_count", data["answer_count"])
            zhihu_itemloader.add_value("is_following", data["is_following"])
            zhihu_itemloader.add_value("url", data["url"])
            zhihu_itemloader.add_value("url_token", data["url_token"])
            zhihu_itemloader.add_value("id_", data["id"])
            zhihu_itemloader.add_value("articles_count", data["articles_count"])
            zhihu_itemloader.add_value("name", data["name"])
            zhihu_itemloader.add_value("headline", data["headline"])
            zhihu_itemloader.add_value("type_", data["type"])
            zhihu_itemloader.add_value("is_advertiser", data["is_advertiser"])
            zhihu_itemloader.add_value("avatar_url", data["avatar_url"])
            zhihu_itemloader.add_value("is_org", data["is_org"])
            zhihu_itemloader.add_value("gender", data["gender"])
            zhihu_itemloader.add_value("follower_count", data["follower_count"])
            zhihu_itemloader.add_value("badge", data["badge"])
            zhihu_itemloader.add_value("crawl_time",
                                       datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
            yield zhihu_itemloader.load_item()
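# ItemLoader.add_value collects each value into a list per field, so the
# loader above would emit one-element lists unless the item declares an
# output processor. A minimal sketch of a declaration that makes
# load_item() return scalars (ZhihuUserItem and its field set are
# illustrative; on older Scrapy, import TakeFirst from
# scrapy.loader.processors instead):

import scrapy
from itemloaders.processors import TakeFirst


class ZhihuUserItem(scrapy.Item):  # hypothetical name
    name = scrapy.Field(output_processor=TakeFirst())
    url_token = scrapy.Field(output_processor=TakeFirst())
    follower_count = scrapy.Field(output_processor=TakeFirst())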
def parse(self, response):
    results = json.loads(response.text)
    # print(response.text)
    item = ZhihuItem()
    for field in item.fields:
        if field in results.keys():
            item[field] = results.get(field)
    yield item
    yield scrapy.Request(self.followees_url.format(results.get('url_token'),
                                                   self.followees_include),
                         self.parse_followees)
    yield scrapy.Request(self.followers_url.format(results.get('url_token'),
                                                   self.followers_include),
                         self.parse_followers)
def parse_user(self, response):
    result = json.loads(response.text)
    item = ZhihuItem()
    for field in item.fields:
        if field in result.keys():
            item[field] = result.get(field)
    yield item
    yield scrapy.Request(url=self.follow_url.format(user=result.get('url_token'),
                                                    include=self.follow_include,
                                                    offset=0, limit=20),
                         callback=self.parse_follows)
def parse(self, response):
    body = response.body.decode("utf-8", "ignore")
    print(body)
    response_data = json.loads(body)["data"]
    # A full page holds 20 users; anything shorter is the last page.
    if len(response_data) >= 20:
        page_offset = int(re.findall("&offset=(.*?)&", response.url)[0])
        new_page_offset = page_offset + 20
        next_page_url = response.url.replace(
            "&offset=" + str(page_offset) + "&",
            "&offset=" + str(new_page_offset) + "&")
        yield scrapy.Request(url=next_page_url, callback=self.parse)
    for eve_user in response_data:
        item = ZhihuItem()
        item["name"] = eve_user["name"]
        item["is_advertiser"] = eve_user["is_advertiser"]
        item["avatar_url_template"] = eve_user["avatar_url_template"]
        item["user_type"] = eve_user["user_type"]
        item["answer_count"] = eve_user["answer_count"]
        item["type"] = eve_user["type"]
        item["url_token"] = eve_user["url_token"]
        item["user_id"] = eve_user["id"]
        item["articles_count"] = eve_user["articles_count"]
        item["url"] = eve_user["url"]
        item["gender"] = eve_user["gender"]
        item["headline"] = eve_user["headline"]
        item["avatar_url"] = eve_user["avatar_url"]
        item["is_org"] = eve_user["is_org"]
        item["follower_count"] = eve_user["follower_count"]
        # De-duplicate: only emit users whose url_token is not on disk yet.
        with open("userinfor.txt") as f:
            user_list = f.read()
        if eve_user["url_token"] not in user_list:
            with open("userinfor.txt", "a") as f:
                f.write(eve_user["url_token"] + "----")
            yield item
            new_url = ("https://www.zhihu.com/api/v4/members/"
                       + eve_user["url_token"]
                       + "/followers?include=data[*].answer_count,articles_count,"
                         "gender,follower_count,is_followed,is_following,"
                         "badge[?(type=best_answerer)].topics&offset=20&limit=20")
            yield scrapy.Request(url=new_url, callback=self.parse)
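# Re-reading userinfor.txt on every user makes the dedup check cost
# O(file size) per user. A sketch of an in-memory alternative; the
# DedupMixin name, the seen_tokens set, and the file seeding are
# assumptions, not part of the original spider:

class DedupMixin:
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.seen_tokens = set()
        try:
            # Seed from previous runs, which wrote tokens joined by "----".
            with open("userinfor.txt") as f:
                self.seen_tokens.update(t for t in f.read().split("----") if t)
        except FileNotFoundError:
            pass  # first run: nothing crawled yet

    def is_new(self, token):
        # Returns True exactly once per token.
        if token in self.seen_tokens:
            return False
        self.seen_tokens.add(token)
        return True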
def get_user_info(self, response):
    # Parse the user's profile info.
    data = json.loads(response.text)
    # print(data)
    item = ZhihuItem()
    for field in item.fields:
        # item.fields holds the keys declared on the item
        # (locations, employments, educations, business, ...).
        if field in data.keys():
            if field == 'educations':
                try:
                    item[field] = ','.join(
                        edu.get('school').get('name') for edu in data.get(field))
                except Exception:
                    item[field] = ''
            elif field == 'business':
                try:
                    item[field] = data.get(field).get('name')
                except Exception:
                    item[field] = ''
            elif field == 'employments':
                try:
                    item[field] = ','.join(
                        emp.get('job').get('name') for emp in data.get(field))
                except Exception:
                    item[field] = ''
            elif field == 'locations':
                try:
                    item[field] = data.get(field)[0].get('name')
                except Exception:
                    item[field] = ''
            else:
                item[field] = data.get(field)  # plain scalar value
    yield item
    yield scrapy.Request(url=self.followers_url.format(
        user_name=data.get('url_token'),
        include_follow=self.include_follow,
        offset=0, limit=20),
        callback=self.get_followers_parse)
    yield scrapy.Request(url=self.followees_url.format(
        user_name=data.get('url_token'),
        include_follow=self.include_follow,
        offset=0, limit=20),
        callback=self.get_followees_parse)
def parse_homepage(self, response):
    divs = response.xpath(
        '//div[starts-with(@class,"feed-item folding feed-item-hook feed-item")]')
    for div in divs:
        items = ZhihuItem()
        label = div.xpath(
            'div[@class ="feed-main"]/div[@class= "feed-source"]/a/text()').extract()
        print(label)
        # items['label'] = label[0] if label else ''
        # items['title'] = div.xpath('./div/div/div/h2[@class ="feed-title"]/a/text()').extract()[0]
        # item['author'] = div.xpath('./div/div/div/div/div/span/span[@class = "author-link"]/a/text()').extract()[0]
        yield items
def parse_user(self, response):
    # print(response.text)
    result = json.loads(response.text)
    item = ZhihuItem()
    for field in item.fields:
        if field in result.keys():
            item[field] = result.get(field)
    yield item
    yield Request(self.follows_url.format(user=result.get('url_token'),
                                          include=self.follows_query,
                                          limit=20, offset=0),
                  self.parse_follows)
def parse(self, response):
    item = ZhihuItem()
    print(response)
    item["title"] = response.xpath("//meta[@itemprop='name']/@content").extract()
    item["content"] = response.xpath("//span[@itemprop='text']/text()").extract()
    item["link"] = response.xpath("//a[@data-za-detail-view-id='3942']/@href").extract()
    # item["like"] = response.xpath("//button[@class='Button VoteButton VoteButton--up']/text()").extract()
    # item["dislike"] = response.xpath("//button[@class='Button VoteButton VoteButton--down']/text()").extract()
    # item["comment"] = response.xpath("//button[@class='Button ContentItem-action Button--plain Button--withIcon Button--withLabe']/text()").extract()
    print(item['title'], item['content'])
    # yield item
def parse(self, response):
    item = ZhihuItem()
    self.xsrf = response.xpath('.//input[@name="_xsrf"]/@value').extract()[0]
    self.i = 0
    pages = response.xpath('.//div[@class="feed-content"]')
    for page in pages:
        item['title'] = page.xpath('.//h2[@class="feed-title"]/a/text()').extract()
        item['content'] = page.xpath(
            './/div[@class="zh-summary summary clearfix"]/text()').extract()
    for j in range(50):
        yield self.next10()
        time.sleep(0.5)
def parse(self, response):
    sel = scrapy.selector.Selector(response)
    # names = sel.xpath('//div/div/div/div/ul/li/div/div/p[@class="p1"]/a/text()').extract()
    # zans = sel.xpath('//div/div/div/div/ul/li/div/div/p/span/text()').extract()
    # titles = sel.xpath('//div/div/div/div/ul/li/p/a/text()').extract()
    sites = sel.xpath('//div/div/div/div/ul[@class="news-list"]/li')
    items = []
    for site in sites:
        item = ZhihuItem()
        item['name'] = site.xpath('div/div/p[@class="p1"]/a/text()').extract()
        item['zan'] = site.xpath('div/div/p/span/text()').extract()
        item['title'] = site.xpath('p/a/text()').extract()
        items.append(item)
    return items
def parse_user(self, response):  # the callback receives a Response, not a Request
    item = ZhihuItem()
    results = json.loads(response.text)
    for field in item.fields:
        if field in results.keys():
            item[field] = results.get(field)
    yield item
    yield scrapy.Request(
        self.followings_url % (results.get('url_token'), self.followings_include),
        callback=self.parse_followings)
    yield scrapy.Request(
        self.followers_url % (results.get('url_token'), self.followers_include),
        callback=self.parse_followers)
def parse_user(self, response):
    result = json.loads(response.text)
    item = ZhihuItem()
    actor = result.get('data')[0].get('actor')
    for field in item.fields:
        if field in actor.keys():
            item[field] = actor.get(field)
    yield item
    # yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query, offset=0, limit=20),
    #               meta={'proxy': '47.112.222.108:8000'}, callback=self.parse)
    yield Request(self.user_url.format(user=result.get('url_token'),
                                       include=self.user_query,
                                       offset=0, limit=20),
                  callback=self.parse)
def parse_user(self, response):
    result = json.loads(response.text)
    item = ZhihuItem()
    for field in item.fields:
        if field in result.keys():
            item[field] = result.get(field)
    yield item
    # Recurse into this user's follow list.
    yield scrapy.Request(
        self.follows_url.format(user_token=result.get('url_token')),
        callback=self.parse_follows,
        meta={'cookiejar': response.meta['cookiejar']})
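# The 'cookiejar' meta key threaded through here (and in answer_parse at
# the top) is Scrapy's built-in mechanism for keeping separate cookie
# sessions per request chain. A sketch of how the jar gets seeded in
# start_requests (the start URL and jar id are placeholders):

def start_requests(self):
    yield scrapy.Request('https://www.zhihu.com/',
                         meta={'cookiejar': 1},  # session id echoed into follow-ups
                         callback=self.parse_user)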
def parse_user(self, response):
    # print(response.text)
    result = json.loads(response.text)
    item = ZhihuItem()
    for field in item.fields:
        if field in result.keys():
            item[field] = result.get(field)
    yield item
    # Fetch the followee lists of the followees in turn.
    yield scrapy.Request(url=self.followee_url.format(user=result.get("url_token"),
                                                      include=self.followee_query,
                                                      offset=0, limit=20),
                         callback=self.parse_followee)
def parse(self, response):
    # print(response.meta["cookies"])
    for each in Selector(response).xpath(
            r'//div[@class="Card AnswerCard"] | //div[@class="Card MoreAnswers"]'):
        item = ZhihuItem()
        # `each` is already a Selector, so query it directly.
        item["authorName"] = each.xpath(
            r'.//span[@class="UserLink AuthorInfo-name"]//a/text()').extract()[0]
        item["authorUrl"] = each.xpath(
            r'.//div[@class="ContentItem-meta"]//meta[@itemprop="url"]/@content').extract()[0]
        item["url"] = response.url
        yield item
def parse_item(self, response):
    selector = Selector(response)
    items = []
    for elem in selector.xpath(
            '//div[@class="feed-content"]/h2[@class="feed-title"]'):
        item = ZhihuItem()
        item['title'] = elem.xpath('a/text()').extract()
        item['link'] = elem.xpath('a/@href').extract()
        items.append(item)
        # extract() returns a list of strings, so print it directly.
        print(item['title'])
        print(item['link'])
    return items
def parse_user(self, response):
    print(response.text)
    result = json.loads(response.text)
    # print(result)
    item = ZhihuItem()
    print('parsed result:')
    for field in item.fields:
        if field in result.keys():
            item[field] = result.get(field)
    yield item
    # Both requests page the same follow_url template; they differ only in
    # which callback handles the resulting list.
    yield Request(self.follow_url.format(user=result.get('url_token'),
                                         include=self.follow_query,
                                         offset=0),
                  callback=self.parse_follow)
    yield Request(self.follow_url.format(user=result.get('url_token'),
                                         include=self.follow_query,
                                         offset=0),
                  callback=self.parse_follower)