def parse_list(self, response):
    """Yield one ``DianpingItem`` per shop row of a listing page.

    Every ``<li>`` under ``div#shop-all-list`` is treated as one shop.
    Rows that carry neither a shop name nor a shop link are skipped,
    because they cannot be tied to a shop; every other missing node
    falls back to a default instead of aborting the whole page.

    Args:
        response: the downloaded listing page.

    Yields:
        DianpingItem: with ``shopname``, ``shopurl``, ``shoplevel``,
        ``commentnum``, ``avgcost``, ``taste``, ``envi``, ``service``,
        ``foodtype`` and ``loc`` filled in.
    """

    def first(node, path, default=''):
        # First match of ``path`` relative to ``node``, or ``default``.
        return node.xpath(path).extract_first(default=default)

    selector = Selector(response)
    for entry in selector.xpath('//div[@id="shop-all-list"]/ul/li'):
        shopname = first(entry, 'div[2]/div[1]/a[1]/h4/text()', None)
        href = first(entry, 'div[2]/div[1]/a[1]/@href', None)
        if not shopname or not href:
            continue
        print(shopname)

        # Only site-relative links get the site prefix; links that are
        # already absolute are kept so the prefix is not doubled.
        if href.startswith('//'):
            shopurl = 'http:' + href
        elif href.startswith('http'):
            shopurl = href
        else:
            shopurl = 'http://www.dianping.com' + href

        # A fresh item per row: yielding one shared, mutated instance
        # lets later rows overwrite items that are still in flight.
        item = DianpingItem()
        item['shopname'] = shopname
        item['shopurl'] = shopurl
        item['shoplevel'] = first(entry, 'div[2]/div[2]/span/@title')
        item['commentnum'] = first(entry, 'div[2]/div[2]/a[1]/b/text()', '0')

        # Keep only the digits of the price text. Filtering the text
        # directly avoids str() on non-ASCII input, which raises
        # UnicodeEncodeError on Python 2.
        avgcost = first(entry, 'div[2]/div[2]/a[2]/b/text()', None)
        if avgcost is None:
            item['avgcost'] = '0'
        else:
            item['avgcost'] = ''.join(ch for ch in avgcost if ch.isdigit())

        item['taste'] = first(entry, 'div[2]/span/span[1]/b/text()', '0')
        item['envi'] = first(entry, 'div[2]/span/span[2]/b/text()', '0')
        item['service'] = first(entry, 'div[2]/span/span[3]/b/text()', '0')
        item['foodtype'] = first(entry, 'div[2]/div[3]/a[1]/span/text()')
        item['loc'] = first(entry, 'div[2]/div[3]/a[2]/span/text()')
        yield item
def parse_0(self, response):
    """Print photo URL, name and link of every shop row on a listing page.

    A ``DianpingItem`` is filled in for each ``<li>`` under
    ``div#shop-all-list`` but is never yielded, so this callback returns
    ``None`` and its only visible output is what it prints.
    NOTE(review): looks like a debugging variant of the listing parser;
    confirm whether the items were meant to be yielded.

    Rows without a shop name or shop link are skipped; other missing
    nodes fall back to a default instead of raising ``IndexError``.

    Args:
        response: the downloaded listing page.
    """

    def first(node, path, default=''):
        # First match of ``path`` relative to ``node``, or ``default``.
        return node.xpath(path).extract_first(default=default)

    selector = Selector(response)
    for entry in selector.xpath('//div[@id="shop-all-list"]/ul/li'):
        photourl = first(entry, 'div[1]/a[1]/img/@data-src')
        shopname = first(entry, 'div[2]/div[1]/a[1]/h4/text()', None)
        href = first(entry, 'div[2]/div[1]/a[1]/@href', None)
        if not shopname or not href:
            continue

        # Only site-relative links get the site prefix; links that are
        # already absolute are kept so the prefix is not doubled.
        if href.startswith('//'):
            shopurl = 'http:' + href
        elif href.startswith('http'):
            shopurl = href
        else:
            shopurl = 'http://www.dianping.com' + href

        print(photourl)
        print(shopname)
        print(shopurl)

        item = DianpingItem()
        item['photourl'] = photourl
        item['shopname'] = shopname
        item['shopurl'] = shopurl
        item['shoplevel'] = first(entry, 'div[2]/div[2]/span/@title')
        item['commentnum'] = first(entry, 'div[2]/div[2]/a[1]/b/text()', '0')

        # Keep only the digits of the price text. Filtering the text
        # directly avoids str() on non-ASCII input, which raises
        # UnicodeEncodeError on Python 2.
        avgcost = first(entry, 'div[2]/div[2]/a[2]/b/text()', None)
        if avgcost is None:
            item['avgcost'] = '0'
        else:
            item['avgcost'] = ''.join(ch for ch in avgcost if ch.isdigit())

        item['foodtype'] = first(entry, 'div[2]/div[3]/a[1]/span/text()')
        item['loc'] = first(entry, 'div[2]/div[3]/a[2]/span/text()')
def parse_info(self, response):
    """Build a single ``DianpingItem`` from a shop detail page.

    Reads the breadcrumb, the ``div#basic-info`` block and the comment
    list. Nodes that are absent from the page produce an empty string
    (``None`` for the raw pass-through fields ``pic``, ``avgcost``,
    ``street_address`` and ``tel``) instead of raising ``TypeError``.

    Args:
        response: the downloaded shop detail page.

    Yields:
        DianpingItem: one item describing the shop, including a
        ``comments`` list of the non-empty comment texts found.
    """
    print('Here is response!!')
    print(response)

    def stripped(text):
        # extract_first() returns None when the node is missing.
        # NOTE(review): the earlier code cut fixed character counts off
        # both ends; assumed to be surrounding whitespace - confirm.
        return text.strip() if text else ''

    def without_label(text):
        # The first three characters of a score text are dropped.
        # NOTE(review): presumably a label prefix - confirm.
        return text[3:] if text else ''

    item = DianpingItem()
    selector = Selector(response)
    info = selector.xpath('//div[@id="basic-info"]')
    crumbs = selector.xpath('//div[@class="breadcrumb"]')

    item['pic'] = selector.xpath(
        '//a[@class="J_main-photo"]/img/@src').extract_first()
    item['foodtype'] = stripped(crumbs.xpath('a[3]/text()').extract_first())
    item['loc'] = stripped(crumbs.xpath('a[2]/text()').extract_first())

    shopname = info.xpath('h1/text()').extract_first()
    item['shopname'] = stripped(shopname)
    print(shopname)

    shopurl = response.url
    item['shopurl'] = shopurl
    # Take the path segment after "/shop/" rather than a fixed offset,
    # so an https scheme or a trailing query string does not corrupt
    # the ID. Fall back to the fixed offset if the marker is missing.
    _, marker, tail = shopurl.partition('/shop/')
    if marker:
        shop_id = tail
        for stop in ('/', '?', '#'):
            shop_id = shop_id.split(stop, 1)[0]
    else:
        shop_id = shopurl[29:]
    item['ID'] = shop_id

    # The last two characters of the class attribute become "X.Y".
    level_class = info.xpath('div[1]/span[1]/@class').extract_first()
    if level_class and len(level_class) >= 2:
        item['shoplevel'] = level_class[-2] + '.' + level_class[-1]
    else:
        item['shoplevel'] = ''

    item['avgcost'] = info.xpath('div[1]/span[3]/text()').extract_first()
    item['taste'] = without_label(
        info.xpath('div[1]/span[4]/span[1]/text()').extract_first())
    item['envi'] = without_label(
        info.xpath('div[1]/span[4]/span[2]/text()').extract_first())
    item['service'] = without_label(
        info.xpath('div[1]/span[4]/span[3]/text()').extract_first())
    item['street_address'] = info.xpath(
        'div[2]/span[2]/@title').extract_first()
    item['tel'] = info.xpath('p/span[2]/text()').extract_first()

    # Where the comment text lives depends on the class of the first
    # nested div of the entry; entries with any other class are ignored.
    text_paths = {
        'photos': 'div/p[2]/text()',
        'info J-info-short': 'div/div[2]/p/text()',
    }
    comments = []
    for entry in selector.xpath('//ul[@class="comment-list J-list"]/li'):
        flag = entry.xpath('div/div/@class').extract_first()
        path = text_paths.get(flag)
        if path is None:
            continue
        text = entry.xpath(path).extract_first()
        # Skip missing text as well as empty text, so None never ends
        # up in the comments list.
        if text:
            print(text)
            comments.append(text)
    item['comments'] = comments
    yield item
def parse_list(self, response):
    """Yield one loaded ``DianpingItem`` per shop row of a listing page.

    Every ``<li>`` under ``div#shop-all-list`` gets its own
    ``ItemLoader`` bound to that row, so the CSS rules below only see
    the markup of a single shop and each row produces its own item.

    Args:
        response: the downloaded listing page.

    Yields:
        The item returned by ``ItemLoader.load_item()`` for each row.
    """
    selector = Selector(response)
    for entry in selector.xpath('//div[@id="shop-all-list"]/ul/li'):
        # One loader per row, scoped with ``selector=entry``; a single
        # loader bound to the whole response would collect the values
        # of every shop on the page into one item.
        loader = ItemLoader(item=DianpingItem(), selector=entry)
        loader.add_css("shop_name", ".txt .tit a h4::text")
        loader.add_css("shop_url", ".txt .tit a::attr(href)")
        loader.add_css("shop_lev", ".txt .comment span::attr(title)")
        loader.add_css("comment_num", ".txt .comment .review-num b::text")
        loader.add_css("avg_cost", ".txt .comment .mean-price b::text")
        # CSS has no "[0]" index syntax; positions are written with the
        # 1-based :nth-of-type(), so the former 0, 1, 2 become 1, 2, 3.
        loader.add_css(
            "shop_taste", ".txt .comment-list span:nth-of-type(1) b::text")
        loader.add_css(
            "shop_env", ".txt .comment-list span:nth-of-type(2) b::text")
        loader.add_css(
            "shop_service", ".txt .comment-list span:nth-of-type(3) b::text")
        # ".txt" needs its leading dot; a bare "txt" selects an element
        # named <txt>, not the class.
        loader.add_css(
            "shop_tag", ".txt .tag-addr a:nth-of-type(1) span::text")
        loader.add_css(
            "shop_tag_addr", ".txt .tag-addr a:nth-of-type(2) span::text")
        # NOTE(review): assumes a third link exists under .tag-addr for
        # the address; verify against the page markup.
        loader.add_css(
            "shop_addr", ".txt .tag-addr a:nth-of-type(3) span::text")
        yield loader.load_item()