def jsonparse(self, response):
    """Parse one page of the map-search AJAX JSON and paginate.

    Yields one DianpingItem per shop record on the page, then — while
    the current page index is below the reported ``pageCount`` —
    requests the next page, carrying the same city/district context
    through ``meta`` back into this callback.
    """
    content = json.loads(response.body)
    city_name = response.meta['city_name']
    district_name = response.meta['district_name']
    page = response.meta['page']
    city_id = response.meta['city_id']
    district_id = response.meta['district_id']
    shop_list = content['shopRecordBeanList']
    for info in shop_list:
        items = DianpingItem()
        items['city_name'] = city_name
        items['district_name'] = district_name
        items['shop_id'] = info['shopId']
        items['brand_name'] = info['shopRecordBean']['shopName']
        items['shop_name'] = info['shopRecordBean']['shopTotalName']
        items['shop_address'] = info['address']
        yield items
    pagecount = content['pageCount']
    if page < int(pagecount):
        page += 1
        # BUG FIX: the original literal contained '®ionId=' — the result
        # of '&reg' being collapsed into the ® HTML entity — which lost
        # the '&regionId=' query separator. Restored the correct
        # '&categoryId=182&regionId=' query string.
        url = ('http://www.dianping.com/search/map/ajax/json?cityId='
               + str(city_id) + '&categoryId=182&regionId='
               + str(district_id) + '&page=' + str(page))
        yield scrapy.Request(
            url,
            headers=headers,
            meta={
                'city_name': city_name,
                'district_name': district_name,
                'page': page,
                'city_id': city_id,
                'district_id': district_id
            },
            callback=self.jsonparse,
            dont_filter=True)
def parseShop(self, response):
    """Pull the inline ``window.shop_config`` blob plus rating fields.

    Only 'shop_info' is stored on the item; the other extracted fields
    (star, review count, price, sub-scores, summary tags) are printed
    for inspection.
    """
    blob = re.findall('window.shop_config=(.*?)</script>', response.text,
                      re.S)[0]
    # NOTE(review): dumps-then-loads round-trips the text back to the
    # same string — a plain json.loads(blob) was probably intended here;
    # confirm before changing.
    shop_info = json.loads(json.dumps(blob))
    star = response.xpath(
        '//*[@id="basic-info"]/div[1]/span[1]/@title').extract()
    reviewCount = response.xpath('//*[@id="reviewCount"]/text()').extract()
    price = response.xpath('//*[@id="avgPriceTitle"]/text()').extract()
    flavor = response.xpath(
        '//*[@id="comment_score"]/span[1]/text()').extract()
    environment = response.xpath(
        '//*[@id="comment_score"]/span[2]/text()').extract()
    service = response.xpath(
        '//*[@id="comment_score"]/span[3]/text()').extract()
    comm = response.xpath(
        '//*[@id="summaryfilter-wrapper"]/div/label/span/text()').extract()
    print(star, reviewCount, price, flavor, environment, service, comm)
    item = DianpingItem()
    item['shop_info'] = shop_info
    return item
def parseDetail(self, response):
    """Build a DianpingItem from the activity-detail JSON payload.

    Reads the first shop entry of ``activityShopInfoList`` and the
    first tag (when present); 'like' and 'apply_result' are left blank.
    """
    payload = json.loads(response.body.decode(response.encoding))
    detail = payload['data']['detail']
    shop = detail['activityShopInfoList'][0]
    item = DianpingItem()
    item['id'] = detail['offlineActivityId']
    item['title'] = detail['title']
    item['cost'] = detail['cost']
    item['shopAddress'] = shop['shopAddress']
    item['distanceInfo'] = shop['distanceInfo']
    item['distance'] = shop['distance']
    item['score'] = shop['shopPower']
    item['shopName'] = shop['shopName']
    item['shopType'] = shop['shopType']
    tags = detail['offlineActivityTagDTOList']
    if tags:
        item['tagId'] = tags[0]['tagId']
        item['tagName'] = tags[0]['tagName']
    else:
        item['tagId'] = 0
        item['tagName'] = ''
    item['like'] = ''
    item['apply_result'] = ''
    yield item
def parse(self, response):
    """Scrape the search-result list page into a single item.

    Every field holds the full extracted list for its column (one entry
    per shop row); shop_tel / shop_star were never wired up.
    """
    field_queries = (
        ('shop_name',
         '//*[@id="shop-all-list"]/ul/li/div[2]/div[1]/a/h4/text()'),
        ('shop_city', '//*[@id="page-header"]/div[1]/a[2]/text()'),
        ('shop_address_1',
         '//*[@id="shop-all-list"]/ul/li/div[2]/div[3]/a[2]/span/text()'),
        ('shop_address_2',
         '//*[@id="shop-all-list"]/ul/li/div[2]/div[3]/span/text()'),
        ('com_num',
         '//*[@id="shop-all-list"]/ul/li/div[2]/div[2]/a[1]/b/text()'),
        ('price_avg',
         '//*[@id="shop-all-list"]/ul/li/div[2]/div[2]/a[2]/b/text()'),
        ('tag_name',
         '//*[@id="shop-all-list"]/ul/li/div[2]/div[3]/a[1]/span/text()'),
        ('kou_wei',
         '//*[@id="shop-all-list"]/ul/li/div[2]/span/span[1]/b/text()'),
        ('huan_jing',
         '//*[@id="shop-all-list"]/ul/li/div[2]/span/span[2]/b/text()'),
        ('fu_wu',
         '//*[@id="shop-all-list"]/ul/li/div[2]/span/span[3]/b/text()'),
    )
    item = DianpingItem()
    for field, query in field_queries:
        item[field] = response.xpath(query).extract()
    return item
def parse_detail_de(self, response):
    """Parse an education-shop detail page with BeautifulSoup.

    Falls back to the verification-code cracker when the request was
    redirected to the meituan anti-bot verify page; otherwise parses
    the raw body. Yields one item keyed by a hash of name + address.
    """
    name_ser = response.meta['name_ser']  # NOTE(review): read but never used
    url = response.meta['url']
    print('++++++++++++++++++')
    print(url)
    print(response.url)
    # Redirected to the anti-bot page -> crack it to obtain the real HTML.
    if 'https://verify.meituan.com/v2/web/general_page?' in response.url:
        html = Crack_verification_code.Crack(response.url, url)
    else:
        html = response.body
    item = DianpingItem()
    soup = BeautifulSoup(str(html), 'lxml')
    shop_name = soup.find('div', class_="shop-name").find('h1').get_text()
    # First span's class list, e.g. ['mid-rank-stars', 'mid-str40'];
    # index 1 carries the numeric star level ("mid-str40" -> 40).
    rank_level = soup.find('div', class_="rank").find('span').get('class')
    rank = soup.find('div', class_="rank").find_all('span', class_="item")
    phone = soup.find('div', class_="phone").find_all('span',
                                                      class_="item J-phone-hide")
    address = soup.find('div', class_="address").get_text().replace(
        '地址:', '').replace(' ', '').replace('\n', '')
    # Sub-scores are labelled spans; route each onto its item field.
    for i in rank:
        if '效果' in i.get_text():
            item['effection'] = i.get_text().replace('效果:', '')
        if '师资' in i.get_text():
            item['teachers'] = i.get_text().replace('师资:', '')
        if '环境' in i.get_text():
            item['environment'] = i.get_text().replace('环境:', '')
    # "mid-str40" -> 40 -> 4.0 -> "4.0星"
    item['star'] = str(
        int(rank_level[1].replace('mid-str', '')) / 10) + '星'
    item['name'] = shop_name
    phone_list = []
    for i in phone:
        phone_list.append(i.get('data-phone'))
    item['address'] = address
    item['phone'] = phone_list
    class_shop = soup.find_all('div', class_="item notag")
    class_list = []
    for i in class_shop:
        class_dict = {}
        class_dict['class_name'] = i.find('p', class_="title").get_text()
        class_dict['class_price'] = i.find('div', class_="price").find(
            'span', class_="cur").get_text().replace('\n', '').replace(' ', '')
        # NOTE(review): class_dict is never appended to class_list, and
        # class_list is never stored on the item — looks like an
        # unfinished feature; left untouched.
    shop_info = soup.find('div', id="info").find('ul',
                                                 class_="con").find_all('li')
    for i in shop_info:
        if i.find("span", class_="title").get_text() == "商户介绍":
            item['description'] = i.get_text().replace('\r\n', '').replace(
                ' ', '').replace('\n', '')
        if i.find("span", class_="title").get_text() == "特色服务":
            # NOTE(review): the whitespace inside this replace() literal
            # spanned a line break in the original file and was mangled
            # during extraction — verify the exact literal against the
            # original source.
            character = i.get_text().replace(' \n', '').split('\n')
            item['characteristic'] = [i for i in character if i != ''][1:]
    item['_id'] = self.hash_distanct(item['name'], item['address'])
    yield item
def parse_coord(self, response):
    """Extract telephone and lat/lng coordinates from a shop page.

    Copies the partial detail dict passed via ``meta['detail']``, then
    adds 'tele' (second text node of the phone paragraph) and 'coord'
    ("lat,lng", or '' when the inline ``window.shop`` script or the
    coordinate fields are absent).
    """
    item = DianpingItem()
    item.update(response.meta['detail'])
    item['tele'] = response.css(
        'p.expand-info.tel::text').extract()[1].strip()
    try:
        # The coordinates live inside an inline <script> containing
        # 'window.shop'; lookbehind/lookahead isolate the quoted values.
        coord_detail = [
            x for x in response.css('script').extract() if 'window.shop' in x
        ][0]
        lat = re.findall('(?<=shopGlat: ").*?(?=",)', coord_detail)[0]
        lng = re.findall('(?<=shopGlng:").*?(?=",)', coord_detail)[0]
        coord = lat + ',' + lng
    except IndexError:
        # BUG FIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit and masked real bugs. Only the
        # expected failure — no matching script / no regex match, i.e.
        # an IndexError on [0] — should fall back to an empty coord.
        coord = ''
    item['coord'] = coord
    yield item
def parse_dir_contents(self, response):
    """Build an item with shop id, name, 'good' summary tags and a timestamp.

    'good_summary' collects the anchor text of every
    ``span.good.J-summary`` tag; 'last_updated' is the crawl time.
    """
    item = DianpingItem()
    # BUG FIX: use a raw string for the regex — '\d' in a plain literal
    # is an invalid escape sequence (DeprecationWarning, and a syntax
    # error in future Python versions).
    # NOTE(review): under Python 3 / modern Scrapy, response.body is
    # bytes and would need response.text here — confirm runtime version.
    item['shop_id'] = re.search(r'shopId=(\d+)', response.body).group(1)
    item['shop_name'] = response.xpath(
        '//*[@id="basic-info"]/h1/text()').extract()[0].strip()
    item['good_summary'] = []
    item['last_updated'] = strftime("%Y-%m-%d %H:%M:%S")
    for sel in response.xpath('//span[@class="good J-summary"]'):
        item['good_summary'].append(sel.xpath('a/text()').extract()[0])
    yield item
def parse_page(self, response):
    """Yield one DianpingItem per review on a review-list page."""

    def _squash(text):
        # Remove layout newlines/spaces from an extracted text node.
        return text.replace('\n', '').replace(' ', '')

    item = DianpingItem()
    print(response.url)
    reviews = response.xpath(
        "//div[@class='reviews-items']/ul/li/div[@class='main-review']")
    for review in reviews:
        username = _squash(review.xpath(
            "./div[@class='dper-info']/a[@class='name']/text()"
        ).extract()[0])
        rank_base = "./div[@class='review-rank']/span[@class='score']"
        taste = _squash(review.xpath(
            rank_base + "/span[@class='item'][1]/text()").extract()[0])
        environment = _squash(review.xpath(
            rank_base + "/span[@class='item'][2]/text()").extract()[0])
        service = _squash(review.xpath(
            rank_base + "/span[@class='item'][3]/text()").extract()[0])
        # The fourth sub-score (per-person cost) is optional.
        pre_data = review.xpath(
            rank_base + "/span[@class='item'][4]/text()").extract()
        pre = _squash(pre_data[0]) if pre_data else 'Null'
        comment_data = review.xpath(
            "./div[@class='review-truncated-words']/text()").extract()
        if comment_data:
            comment = comment_data[0].replace('\n', '').replace(
                '\t', '').replace(' ', '')
        else:
            comment = 'Null'
        cre_time = review.xpath(
            "./div[@class='misc-info clearfix']/span[@class='time']/text()"
        ).extract()[0].replace('\n', '')
        # Star rating comes from the CSS class, e.g.
        # "sml-rank-stars sml-str40 star" — strip everything but digits.
        star = review.xpath(
            "./div[@class='review-rank']/span[1]/@class").extract()[0]
        for junk in ('sml-rank-stars', 'sml-str', 'star', ' ', '0'):
            star = star.replace(junk, '')
        item['username'] = username
        item['taste'] = taste
        item['environment'] = environment
        item['service'] = service
        item['pre'] = pre
        item['comment'] = comment
        item['cre_time'] = cre_time
        item['star'] = star
        print(item)
        yield item
def parse_info(self, response):
    """Parse a shop-list page, buffer partial items, follow detail links.

    Each partial item is stashed in ``self.items_buffer`` keyed by the
    shopId pulled from the shop link; ``parse_details`` is expected to
    complete it later.
    """
    hxs = HtmlXPathSelector(response)
    sites = hxs.select("//dd[child::ul[@class='remark']]")
    for site in sites:
        item = DianpingItem()
        item['name'] = site.select(
            "descendant::li[@class='shopname']/a/text()").extract()
        shoplink = site.select(
            "descendant::li[@class='shopname']/a[1]/@href").extract()
        shoplink = shoplink[0]
        # BUG FIX: raw string for the regex — '\d' in a plain literal is
        # an invalid escape sequence.
        shopID = re.search(r"shopId=(\d+)#", shoplink).groups()[0]
        item['tag'] = site.select(
            "descendant::li[@class='tags']/descendant::text()").extract()
        item['avgPrice'] = site.select(
            "descendant::strong[@class='average']/text()").extract()
        item['stars'] = site.select(
            "descendant::span[contains(@class,'item-rank-rst')]/@title"
        ).extract()
        self.items_buffer[shopID] = item
        # % binds tighter than +: base_url is formatted into the message,
        # then shoplink is appended.
        log.msg("ken: yield link:%s" % self.base_url + shoplink)
        yield Request(url=self.base_url + shoplink,
                      callback=self.parse_details)
def parse(self, response):
    """Yield one item per description paragraph (``p.desc.J-desc``).

    Only 'title' is populated (all text nodes joined); 'link' and
    'desc' are intentionally left empty.
    """
    for paragraph in response.css('p.desc.J-desc'):
        item = DianpingItem()
        item['title'] = ''.join(paragraph.xpath('./text()').extract())
        item['link'] = ''
        item['desc'] = ''
        yield item
def parseResponse(self, response, N):
    """Yield rating / description / airline items from the review list.

    The description text has tags removed and newline/tab/carriage-
    return/space escape characters stripped.
    """
    sel = Selector(response)
    for node in sel.xpath('//li/div[2]'):
        item = DianpingItem()
        item['estar'] = node.xpath('div[1]/span/@title').extract()
        raw_desc = node.xpath('div[2]/div/text()').extract()
        item['desc'] = replace_escape_chars(
            remove_tags(raw_desc[0]),
            which_ones=('\n', '\t', '\r', ' '))
        item['airline'] = node.xpath('div[3]/h2/text()').extract()
        yield item
def parse_dianping(self, response):
    # Scrape one shop page into name / address / coordinates / city /
    # region fields and count crawled shops.
    # NOTE: Python 2 syntax (print statements) — do not run under py3.
    item = DianpingItem()
    item['shop_name'] = response.xpath(
        '//h1[@class="shop-name"]//text()').extract()[0].strip()
    item['shop_address'] = response.xpath(
        '//div[@class="expand-info address"]//'
        'span[@itemprop="street-address"]/@title').extract()[0]
    # Longitude/latitude are embedded in an inline script in the sidebar;
    # .re() returns the two captured groups (or an empty list).
    lng_atr = response.xpath('//div[@id="aside"]/script/text()')\
        .re(r"lng:(\d*.\d*),lat:(\d*.\d*)")
    try:
        item['shop_longitude'], item['shop_latitude'] = lng_atr
    except ValueError as error:
        # Unpacking failed (no coordinates found) — default to 0,0.
        item['shop_longitude'], item['shop_latitude'] = 0, 0
        print "There is no longitude nor latitude of the shop!"
    item['shop_city'] = response.xpath(
        '//a[@class="city J-city"]//text()').extract()[0].strip()
    item['shop_region'] = response.xpath(
        '//span[@itemprop="locality region"]//text()').extract()[0].strip(
        )
    self.shops_count += 1
    print "%d shops are crawled." % self.shops_count
    yield item
def parseDoApply(self, response):
    """Emit the apply result for an activity stashed in ``meta``.

    All activity fields are copied verbatim from
    ``meta['activityInfo']``; only 'apply_result' comes from the
    response payload.
    """
    payload = json.loads(response.body.decode(response.encoding))
    activityInfo = response.meta['activityInfo']
    item = DianpingItem()
    for key in ('id', 'title', 'cost', 'shopAddress', 'distanceInfo',
                'distance', 'score', 'shopName', 'shopType', 'tagId',
                'tagName', 'like'):
        item[key] = activityInfo[key]
    item['apply_result'] = payload['data']['desc']
    yield item
def parse_comment(self, response):
    """Parse a restaurant comment page and yield one item per comment.

    Breadcrumb layout varies (7/6/other <li> entries), so area and
    category fields are resolved per-layout, using a whitelist of
    known level-A cuisine names to disambiguate the 6-entry case.
    When a page has no comments, a single item with empty comment
    fields is emitted; otherwise one item per customer, followed by a
    request for the next page.
    """
    item = DianpingItem()
    # Known top-level cuisine categories, used to tell whether the
    # third-from-last crumb is a level-A category or a sub-category.
    categoryLevelA_range = ['面包甜点', '自助餐', '咖啡厅', '西餐', '台湾菜', '贵州菜', '江西菜', '东南亚菜', '其他', '俄罗斯菜', '新疆菜', '粤菜', '素菜', '日本料理', '日本菜', '云贵菜', '小吃快餐', '家常菜', '私房菜', '串串香', '本帮江浙菜', '江浙菜', '苏州江浙', '烧烤', '烤鱼', '鲁菜', '客家菜', '南京/江浙菜', '蟹宴', '茶馆', '创意菜', '面馆', '酒吧', '北京菜', '快餐简餐', '小吃', '海鲜', '火锅', '湘菜', '川菜', '兔头/兔丁', '西北菜', '粥粉面', '云南菜', '粤菜/潮州菜', '东北菜', '农家菜', '小龙虾', '大闸蟹', '粉面馆', '湖北菜', '杭帮/江浙菜', '茶餐厅', '徽菜', '闽菜', '韩国料理']
    customers = response.xpath('//div[@class = "comment-list"]/ul/li')
    crumb = response.xpath('//div[@class = "crumb"]//li')
    # Resolve area / categoryLevelA / categoryLevelB from the breadcrumb.
    if len(crumb) == 7:
        areaName = crumb[2].xpath('strong//span/text()').extract()[0]
        categoryLevelA = crumb[3].xpath('strong//span/text()').extract()[0]
        categoryLevelB = crumb[4].xpath('strong//span/text()').extract()[0]
    elif len(crumb) == 6:
        categoryLevelA = crumb[-3].xpath('strong//span/text()').extract()[0]
        if categoryLevelA in categoryLevelA_range:
            # Crumb already names a level-A category; no level-B given.
            categoryLevelA = categoryLevelA
            categoryLevelB = categoryLevelA
            areaName = crumb[-4].xpath('strong//span/text()').extract()[0]
        else:
            # Crumb names a sub-category; its parent is one crumb earlier.
            categoryLevelB = categoryLevelA
            categoryLevelA = crumb[-4].xpath('strong//span/text()').extract()[0]
            areaName = crumb[-5].xpath('strong//span/text()').extract()[0]
    else:
        areaName = crumb[1].xpath('strong//span/text()').extract()[0]
        categoryLevelA = crumb[-3].xpath('strong//span/text()').extract()[0]
        categoryLevelB = categoryLevelA
    if len(customers) == 0:
        # No comments on the page: emit one placeholder item carrying
        # the restaurant metadata passed through meta, with every
        # per-comment field blanked.
        item['distinctName'] = response.meta['distinctName']
        item['cityName'] = response.meta['cityName']
        item['areaName'] = areaName
        item['categoryLevelA'] = categoryLevelA
        item['categoryLevelB'] = categoryLevelB
        item['restaurantName'] = response.meta['restaurantName']
        item['restaurantStar'] = response.meta['restaurantStar']
        item['scoreOfTaste'] = response.meta['scoreOfTaste']
        item['scoreOfEnvironment'] = response.meta['scoreOfEnvironment']
        item['scoreOfService'] = response.meta['scoreOfService']
        item['averageCost'] = response.meta['averageCost']
        item['restaurantAddress'] = response.meta['restaurantAddress']
        item['restaurantTel'] = response.meta['restaurantTel']
        item['commentCount'] = response.meta['commentCount']
        item['commentSum'] = response.meta['commentSum']
        item['rankTotal_5_Count'] = ''
        item['rankTotal_4_Count'] = ''
        item['rankTotal_3_Count'] = ''
        item['rankTotal_2_Count'] = ''
        item['rankTotal_1_Count'] = ''
        item['customerName'] = ''
        item['customerLevel'] = ''
        item['customerVIP'] = ''
        item['commRankTotal'] = ''
        item['commRankTaste'] = ''
        item['commRankEnvironment'] = ''
        item['commRankService'] = ''
        item['commCostPer'] = ''
        item['commentContent'] = ''
        item['commentDate'] = ''
        item['commentLiked'] = ''
        yield item
    else:
        # Star-distribution counts, e.g. "(123)" -> 123, for 5..1 stars.
        rankTotal_5_Count = response.xpath('//div[@class = "comment-star"]/dl/dd[2]//text()').extract()[1]
        rankTotal_5_Count = int(rankTotal_5_Count.replace('(', '').replace(')', ''))
        rankTotal_4_Count = response.xpath('//div[@class = "comment-star"]/dl/dd[3]//text()').extract()[1]
        rankTotal_4_Count = int(rankTotal_4_Count.replace('(', '').replace(')', ''))
        rankTotal_3_Count = response.xpath('//div[@class = "comment-star"]/dl/dd[4]//text()').extract()[1]
        rankTotal_3_Count = int(rankTotal_3_Count.replace('(', '').replace(')', ''))
        rankTotal_2_Count = response.xpath('//div[@class = "comment-star"]/dl/dd[5]//text()').extract()[1]
        rankTotal_2_Count = int(rankTotal_2_Count.replace('(', '').replace(')', ''))
        rankTotal_1_Count = response.xpath('//div[@class = "comment-star"]/dl/dd[6]//text()').extract()[1]
        rankTotal_1_Count = int(rankTotal_1_Count.replace('(', '').replace(')', ''))
        for customer in customers:
            distinctName = response.meta['distinctName']
            cityName = response.meta['cityName']
            areaName = areaName
            categoryLevelA = categoryLevelA
            categoryLevelB = categoryLevelB
            restaurantName = response.meta['restaurantName']
            restaurantStar = response.meta['restaurantStar']
            scoreOfTaste = response.meta['scoreOfTaste']
            scoreOfEnvironment = response.meta['scoreOfEnvironment']
            scoreOfService = response.meta['scoreOfService']
            averageCost = response.meta['averageCost']
            restaurantAddress = response.meta['restaurantAddress']
            restaurantTel = response.meta['restaurantTel']
            commentCount = response.meta['commentCount']
            commentSum = response.meta['commentSum']
            customerName = customer.xpath('div[1]//p[@class = "name"]/a/text()').extract()[0]
            # Map the contribution tooltip to a 1-6 level bucket.
            customerLevel = customer.xpath('div[1]//p[@class = "contribution"]/span/@title').extract()[0]
            if customerLevel == '':
                customerLevel = 1
            elif '200' in customerLevel:
                customerLevel = 2
            elif '400' in customerLevel:
                customerLevel = 3
            elif '1000' in customerLevel:
                customerLevel = 4
            elif '2000' in customerLevel:
                customerLevel = 5
            elif '5000' in customerLevel:
                customerLevel = 6
            else:
                customerLevel = ''
            # VIP flag: 1 when the vip icon is present, else 0.
            customerVIP = customer.xpath('div[1]//i[@class = "icon-vip"]').extract()
            if len(customerVIP) != 0:
                customerVIP = 1
            else:
                customerVIP = 0
            # Overall rank encoded in a CSS class; second-to-last char
            # is the digit. Blank on any failure.
            try:
                commRankTotal = customer.xpath('div[2]//div[@class = "user-info"]/span[1]/@class').extract()[0]
                commRankTotal = int(commRankTotal[-2])
            except:
                commRankTotal = ''
            rankList = customer.xpath('div[2]//div[@class = "comment-rst"]/span/text()').extract()
            try:
                commRankTaste = int(rankList[0][-1])
                commRankEnvironment = int(rankList[1][-1])
                commRankService = int(rankList[2][-1])
            except:
                commRankTaste = ''
                commRankEnvironment = ''
                commRankService = ''
            try:
                commCostPer = customer.xpath('div[2]//div[@class = "user-info"]/span[2]/text()').extract()[0]
                commCostPer = int(re.match(r'.*?(\d+)', commCostPer).group(1))
            except:
                commCostPer = ''
            commentContent = ('\n'.join(customer.xpath('div[2]//div[@class = "J_brief-cont"]//text()').extract())).strip()
            commentDate = customer.xpath('div[2]//div[@class = "misc-info"]/span[@class = "time"]/text()').extract()[0]
            commentLiked = customer.xpath('div[2]//span[@class = "col-right"]/span[1]/a/span/text()').extract()
            try:
                commentLiked = commentLiked[1].replace('(', '').replace(')', '')
                commentLiked = int(commentLiked)
            except:
                commentLiked = ''
            item['cityName'] = cityName
            item['distinctName'] = distinctName
            item['areaName'] = areaName
            item['categoryLevelA'] = categoryLevelA
            item['categoryLevelB'] = categoryLevelB
            item['restaurantName'] = restaurantName
            item['restaurantStar'] = restaurantStar
            item['scoreOfTaste'] = scoreOfTaste
            item['scoreOfEnvironment'] = scoreOfEnvironment
            item['scoreOfService'] = scoreOfService
            item['averageCost'] = averageCost
            item['restaurantAddress'] = restaurantAddress
            item['restaurantTel'] = restaurantTel
            item['commentCount'] = commentCount
            item['commentSum'] = commentSum
            item['rankTotal_5_Count'] = rankTotal_5_Count
            item['rankTotal_4_Count'] = rankTotal_4_Count
            item['rankTotal_3_Count'] = rankTotal_3_Count
            item['rankTotal_2_Count'] = rankTotal_2_Count
            item['rankTotal_1_Count'] = rankTotal_1_Count
            item['customerName'] = customerName
            item['customerLevel'] = customerLevel
            item['customerVIP'] = customerVIP
            item['commRankTotal'] = commRankTotal
            item['commRankTaste'] = commRankTaste
            item['commRankEnvironment'] = commRankEnvironment
            item['commRankService'] = commRankService
            item['commCostPer'] = commCostPer
            item['commentContent'] = commentContent
            item['commentDate'] = commentDate
            item['commentLiked'] = commentLiked
            yield item
        try:
            # When this callback is re-entered for later pages,
            # response.url carries the page query; strip it back to the
            # base ".../more" url before appending the next-page href.
            origin_url = re.match(r'(http:.*?more)', response.url).group(1)
            nextlink = origin_url + response.xpath('//div[@class = "Pages"]/a[last()]/@href').extract()[0]
            yield Request(nextlink, meta={
                'distinctName': distinctName,
                'cityName': cityName,
                # 'areaName': areaName,
                # 'categoryLevelA': categoryLevelA,
                # 'categoryLevelB': categoryLevelB,
                'restaurantName': restaurantName,
                'restaurantStar': restaurantStar,
                'scoreOfTaste': scoreOfTaste,
                'scoreOfEnvironment': scoreOfEnvironment,
                'scoreOfService': scoreOfService,
                'averageCost': averageCost,
                'restaurantAddress': restaurantAddress,
                'restaurantTel': restaurantTel,
                'commentCount': commentCount,
                'commentSum': commentSum}, callback=self.parse_comment)
        except:
            # No next-page link (last page) — stop silently.
            pass
def parseDetail(self, response):
    """Score an activity with the predictor and emit or pre-apply.

    Builds a feature dict for ``predict``; when the prediction is 1 and
    the activity was already applied, yields a result item — otherwise
    yields a pre-apply request. Predictions other than 1 produce
    nothing.
    """
    payload = json.loads(response.body.decode(response.encoding))
    detail = payload['data']['detail']
    shop = detail['activityShopInfoList'][0]
    tags = detail['offlineActivityTagDTOList']
    first_tag_id = tags[0]['tagId'] if tags else 0
    features = {
        'cost': detail['cost'],
        'distance': shop['distance'],
        'score': shop['shopPower'],
        'tagId': first_tag_id
    }
    activityInfo = {
        'id': detail['offlineActivityId'],
        'title': detail['title'],
        'cost': detail['cost'],
        'shopAddress': shop['shopAddress'],
        'distanceInfo': shop['distanceInfo'],
        'distance': shop['distance'],
        'score': shop['shopPower'],
        'shopId': shop['shopId'],
        'shopName': shop['shopName'],
        'shopType': shop['shopType'],
        'tagId': first_tag_id,
        'tagName': tags[0]['tagName'] if tags else '',
    }
    like = predict(features)
    applyed = response.meta['applyed']
    activityInfo['like'] = like
    if like == 1:
        if applyed:
            print('applyed ' + activityInfo['shopName'])
            item = DianpingItem()
            for key in ('id', 'title', 'cost', 'shopAddress',
                        'distanceInfo', 'distance', 'score', 'shopName',
                        'shopType', 'tagId', 'tagName', 'like'):
                item[key] = activityInfo[key]
            item['apply_result'] = '成功'
            yield item
        else:
            yield self.requestGetPreApply(activityInfo)
def parse(self, response):
    # Parse a search-result page into per-shop fields and follow the
    # last pagination link.
    # NOTE: Python 2 syntax (print statements) — do not run under py3.
    # NOTE(review): a single item instance is reused and mutated across
    # all shops in the loop; every yield emits the same object.
    item = DianpingItem()
    sel = Selector(response)
    sites = sel.xpath('//div[@id="shop-all-list"]/ul/li')
    for site in sites:
        title = site.xpath('div[2]/div[1]/a[1]/h4/text()').extract()
        item['shopname'] = title[0]
        print title[0]
        link = site.xpath('div[2]/div[1]/a[1]/@href').extract()
        item['shopurl'] = 'http://www.dianping.com' + str(link[0])
        print 'http://www.dianping.com' + str(link[0])
        shoplevels = site.xpath('div[2]/div[2]/span/@title').extract()
        item['shoplevel'] = shoplevels[0]
        reviewnums = site.xpath('div[2]/div[2]/a[1]/b/text()').extract()
        if len(reviewnums) > 0:
            item['reviewnum'] = reviewnums[0]
        else:
            item['reviewnum'] = '0'
        avgcost = site.xpath('div[2]/div[2]/a[2]/b/text()').extract()
        if len(avgcost) > 0:
            # Strip the currency sign and keep an int.
            item['avgcost'] = int(avgcost[0].lstrip('¥'))
        else:
            item['avgcost'] = '0'
        tastes = site.xpath('div[2]/span/span[1]/b/text()').extract()
        if len(tastes) > 0:
            item['taste'] = tastes[0]
        else:
            item['taste'] = '0'
        envs = site.xpath('div[2]/span/span[2]/b/text()').extract()
        if len(envs) > 0:
            item['env'] = envs[0]
        else:
            item['env'] = '0'
        services = site.xpath('div[2]/span/span[3]/b/text()').extract()
        if len(services) > 0:
            item['service'] = services[0]
        else:
            item['service'] = '0'
        foodtypes = site.xpath('div[2]/div[3]/a[1]/span/text()').extract()
        item['foodtype'] = foodtypes[0]
        location = site.xpath('div[2]/div[3]/a[2]/span/text()').extract()
        item['location'] = location[0]
        yield item
        # Pagination: absolute XPath, so this finds the same page-level
        # element on every loop iteration.
        nextLink = site.xpath(
            '//div[@class="page"]/a[last()]/@data-ga-page').extract()
        print '++++++++++++++++++++++++++++++++++++++++++++++'
        print nextLink
        if nextLink:
            print nextLink[0]
            nextLink = 'http://www.dianping.com/search/category/12/10/o3p' + nextLink[
                0]
            print nextLink
            yield Request(nextLink, headers=self.headers)