def parse_content(self, response):
    """Parse one page of sold-transaction ("chengjiao") listings and yield
    one LianjiaItem per listing.

    response.meta['region'] carries the region key, translated to a display
    name via self.regions.  Every field is optional: each xpath result is
    checked before use so a missing block simply leaves the field unset.
    """
    selector = etree.HTML(response.text)
    cj_list = selector.xpath("//ul[@class='listContent']/li")
    for cj in cj_list:
        item = LianjiaItem()
        item['region'] = self.regions.get(response.meta['region'])

        href = cj.xpath('./a/@href')
        if not href:
            # A listing without a detail link is unusable — skip it.
            continue
        item['href'] = href[0]

        content = cj.xpath('.//div[@class="title"]/a/text()')
        if content:
            # Title text is "<name> <style> <area>" separated by whitespace.
            parts = content[0].split()
            item['name'] = parts[0]
            item['style'] = parts[1]
            item['area'] = parts[2]

        content = cj.xpath('.//div[@class="houseInfo"]/text()')
        if content:
            parts = content[0].split('|')
            item['orientation'] = parts[0]
            item['decoration'] = parts[1]
            # A third '|'-separated segment, when present, is the elevator info.
            item['elevator'] = parts[2] if len(parts) == 3 else '无'

        content = cj.xpath('.//div[@class="positionInfo"]/text()')
        if content:
            parts = content[0].split()
            item['floor'] = parts[0]
            # BUG FIX: the fallback branch previously wrote the misspelled
            # key 'build_yaer', so listings without a year never populated
            # 'build_year'.
            item['build_year'] = parts[1] if len(parts) == 2 else '无'

        content = cj.xpath('.//div[@class="dealDate"]/text()')
        if content:
            item['sign_time'] = content[0]

        content = cj.xpath('.//div[@class="totalPrice"]/span/text()')
        if content:
            item['total_price'] = content[0] + '万'

        content = cj.xpath('.//div[@class="unitPrice"]/span/text()')
        if content:
            item['unit_price'] = content[0] + '元/平'

        content = cj.xpath('.//div[@class="dealHouseTxt"]/span/text()')
        if content:
            for tag in content:
                # str.find returns -1 when the substring is absent.
                if tag.find("房屋满") != -1:
                    item['fangchan_class'] = tag
                elif tag.find("近地铁") != -1:
                    item['subway'] = tag
                elif tag.find("学") != -1:
                    item['school'] = tag
        yield item
def parse_getitem(self, response, province, city, link_chengjiao, host):
    """Parse a single transaction detail page.

    When the page is a human-verification ("人机认证") challenge, re-issue the
    request with refresh_cache set so the cached copy is refetched.
    Otherwise flatten the scraped fields into one list stored on
    LianjiaItem['item'] (later written out as a CSV row).
    """
    title = response.xpath("//h1[@class= 'index_h1']/text()").getall()
    if not title:
        # No listing title: either a captcha page or an unexpected layout.
        n_title = response.xpath(
            "//div[@class='container']/div/h1/text()").get()
        # BUG FIX: n_title can be None, which made the `in` test raise
        # TypeError instead of logging the missing-title warning.
        if n_title and '人机认证' in n_title:
            logging.warning("人机认证: No title in refresh cache" + link_chengjiao)
            request = scrapy.Request(
                link_chengjiao,
                meta={"refresh_cache": True},
                dont_filter=True,
                headers={
                    'Host': host,
                    'Referer': link_chengjiao
                },
                callback=self.parse_getitem,
                cb_kwargs={
                    'province': province,
                    'city': city,
                    'link_chengjiao': link_chengjiao,
                    'host': host
                })
            yield request
        else:
            logging.warning("This is a warning: No title" + link_chengjiao)
    else:
        # Scrape the detail fields.
        price = response.xpath(
            "//span[@class='dealTotalPrice']/i/text()").getall()
        msg = response.xpath("//div[@class='msg']//label/text()").getall()
        content = [
            c.strip() for c in response.xpath(
                "//div[@class = 'content']//li/text()").getall()
        ]
        # BUG FIX: the original test `title is None or ''` always evaluated
        # the constant '' (falsy), so it could never detect an empty title.
        if not title:
            logging.warning('warning there is no Title' + link_chengjiao)
        record_list = []
        record_price = response.xpath(
            "//ul[@class = 'record_list']//span[@class = 'record_price']/text()"
        ).getall()
        record_detail = response.xpath(
            "//ul[@class = 'record_list']//p[@class = 'record_detail']/text()"
        ).getall()
        # BUG FIX: indexing record_detail with the record_price index raised
        # IndexError whenever the two lists differed in length; zip pairs
        # them positionally (extra entries on either side are dropped).
        for rec_price, rec_detail in zip(record_price, record_detail):
            record_list.append(rec_price)
            record_list.extend(rec_detail.split(","))
        # Flatten everything into one list so the pipeline can write a CSV row.
        item = ([province] + [city] + [link_chengjiao] + title + price + msg +
                content + record_list)
        line = LianjiaItem()
        line['item'] = item
        yield line
def parse(self, response):
    """Parse a rental-listings page, yielding one loaded item per listing.

    Each field comes from a page-wide xpath query that returns one flat
    list covering ALL listings; the loop index `i` plus the stride
    arithmetic below ((i+1)*4-1, (i+1)*5-3, ...) selects listing i's slice
    of each list.

    NOTE(review): `print l` at the bottom is a Python 2 print statement and
    the `.encode('utf-8')` calls produce byte strings — this method only
    runs under Python 2.
    """
    #l = ItemLoader(item = LianjiaItem(),response=response)
    # One iteration per listing title found on the page.
    for i in range(
            0,
            len(
                response.xpath(
                    "//div[@class='info-panel']/h2/a/text()").extract())):
        l = ItemLoader(item=LianjiaItem(), response=response)
        info = response.xpath("//div[@class='info-panel']/h2/a/text()"
                              ).extract()[i].encode('utf-8')
        local = response.xpath("//div[@class='info-panel']").xpath(
            ".//span[@class='region']/text()").extract()[i].encode('utf-8')
        house_layout = response.xpath("//div[@class='info-panel']").xpath(
            ".//span[@class='zone']//text()").extract()[i].encode('utf-8')
        house_square = response.xpath("//div[@class='info-panel']").xpath(
            ".//span[@class='meters']/text()").extract()[i].encode('utf-8')
        # Takes the 4th 'where' span of listing i — assumes exactly four
        # spans per listing row (TODO confirm against the page markup).
        house_orientation = response.xpath(
            "//div[@class='info-panel']").xpath(
                ".//div[@class='where']//span/text()").extract()[
                    (i + 1) * 4 - 1].encode('utf-8')
        # [:-6] drops a fixed-length trailing suffix from the district text
        # (presumably a label after the name — verify against the page).
        district = response.xpath("//div[@class='info-panel']").xpath(
            ".//div[@class='con']/a/text()").extract()[i].encode(
                'utf-8')[:-6]
        # Assumes 5 'con' text nodes per listing: 3rd is the floor, 5th the
        # build year (TODO confirm the fixed layout).
        floor = response.xpath("//div[@class='info-panel']").xpath(
            ".//div[@class='con']//text()").extract()[(i + 1) * 5 -
                                                      3].encode('utf-8')
        building_year = response.xpath("//div[@class='info-panel']").xpath(
            ".//div[@class='con']//text()").extract()[(i + 1) * 5 -
                                                      1].encode('utf-8')
        # Two 'num' spans per listing: monthly price, then view count.
        price_month = response.xpath("//div[@class='info-panel']").xpath(
            ".//span[@class='num']//text()").extract()[(i + 1) * 2 -
                                                       2].encode('utf-8')
        person_views = response.xpath("//div[@class='info-panel']").xpath(
            ".//span[@class='num']//text()").extract()[(i + 1) * 2 -
                                                       1].encode('utf-8')
        # Tag labels are scoped to listing i's own label container.
        tags = []
        for j in range(
                0,
                len(
                    response.xpath("//div[@class='view-label left']")
                    [i].xpath(".//span//text()").extract())):
            tags.append(
                response.xpath("//div[@class='view-label left']")[i].xpath(
                    ".//span//text()").extract()[j].encode("utf-8"))
        l.add_value('info', info)
        l.add_value('local', local)
        l.add_value('house_layout', house_layout)
        l.add_value('house_square', house_square)
        l.add_value('house_orientation', house_orientation)
        l.add_value('district', district)
        l.add_value('floor', floor)
        l.add_value('building_year', building_year)
        l.add_value('price_month', price_month)
        l.add_value('person_views', person_views)
        l.add_value('tags', tags)
        print l  # Python 2 print statement — debugging output
        yield l.load_item()
def parse_item(self, response):
    """Extract one for-sale listing into a LianjiaItem.

    Scalar fields (house type, position, area, prices, community) are
    pulled from the page's inline JavaScript via regex; layout details
    come from the "introduction" list, whose index layout differs for
    villas ('别墅') versus ordinary flats.
    """
    page = response.text

    def scraped(pattern):
        # First match of `pattern`'s capture group in the page source.
        return re.findall(pattern, page)[0]

    item = LianjiaItem()
    item['title'] = response.xpath('//h1/@title').extract()[0]
    item['house_type'] = scraped(r"houseType:'(.*?)',")
    item['position'] = scraped(r"resblockPosition:'(.*?)',")
    # position is "lng,lat".
    coords = item['position'].split(',')
    item['longitude'] = coords[0]
    item['latitude'] = coords[1]
    item['area'] = scraped(r"area:'(.*?)',")
    item['total_price'] = scraped(r"totalPrice:'(.*?)',")
    item['avg_price'] = scraped(r"price:'(.*?)',")
    item['community'] = scraped(r"resblockName:'(.*?)',")

    intro = response.xpath('//*[@id="introduction"]//ul/li/text()').extract()
    item['layout'] = intro[0]
    item['floor'] = intro[1]
    if item['house_type'] == '别墅':
        # Villas expose a shorter introduction table.
        item['direction'] = intro[4]
        item['decorate'] = intro[6]
    else:
        item['design'] = intro[3]
        item['direction'] = intro[6]
        item['decorate'] = intro[8]
        item['lift'] = intro[10]
        item['lift_proportion'] = intro[9]

    item['region'] = response.xpath(
        '//span[@class="info"]/a[1]/text()').extract()[0]
    item['local'] = response.xpath(
        '//span[@class="info"]/a[2]/text()').extract()[0]
    yield item
def parseDetail(self, response):
    """Scrape one house-detail page into a LianjiaItem."""
    # Simple text fields: LianjiaItem field -> xpath of the node holding it.
    field_xpaths = {
        'title': '//div[@class="title"]/h1/text()',
        'community': '//div[@class="communityName"]/a[@class="info"]/text()',
        'model': '//div[@class="room"]/div[@class="mainInfo"]/text()',   # room layout
        'floor': '//div[@class="room"]/div[@class="subInfo"]/text()',
        'orientation': '//div[@class="type"]/div[@class="mainInfo"]/text()',
        'decorate': '//div[@class="type"]/div[@class="subInfo"]/text()',
        'area': '//div[@class="area"]/div[@class="mainInfo"]/text()',    # floor area
        'buildtime': '//div[@class="area"]/div[@class="subInfo"]/text()',
        'focus_num': '//span[@id="favCount"]/text()',    # followers count
        'watch_num': '//span[@id="cartCount"]/text()',   # viewers count
        'time': '//div[@class="transaction"]//ul/li[1]/text()',  # listing date
        'price': '//span[@class="total"]/text()',
        'city': '//span[@class="info"]/a[1]/text()',     # district
    }
    item = LianjiaItem()
    for field, xp in field_xpaths.items():
        item[field] = response.xpath(xp).extract_first()
    item['link'] = response.url  # detail-page URL
    # Coordinates embedded in the page's inline script ("resblockPosition").
    item['Latitude'] = response.xpath('//script[19]/text()').re_first(
        r"resblockPosition:'(.*?)'")
    yield item
def get_info(self, response):
    """Scrape a listing detail page into a LianjiaItem whose fields are
    keyed by the Chinese labels shown on the page.

    The "base" and "transaction" tables are scraped as two parallel lists
    (labels, values) and paired positionally; labels not declared as
    LianjiaItem fields are skipped.
    """
    item = LianjiaItem()
    item['链接'] = response.url
    item['小区名称'] = response.xpath(
        '//div[@class="communityName"]//text()').extract()[1]
    item['所在区域'] = response.xpath(
        '//div[@class="areaName"]//span[@class="info"]//a//text()'
    ).extract()
    item['地铁站'] = response.xpath(
        '//div[@class="areaName"]/a[@class="supplement"]//text()').extract()
    item['总价'] = response.xpath('//span[@class="total"]//text()').extract()
    item['单价'] = response.xpath(
        '//span[@class="unitPriceValue"]//text()').extract()[0]
    item['建筑时间'] = response.xpath(
        '//div[@class="area"]//div[@class="subInfo"]//text()').extract(
        )[0].split('/')[0]

    base_infos_keys = response.xpath(
        '//div[@class="base"]//li/span/text()').extract()
    base_infos = response.xpath(
        '//div[@class="base"]//li/text()').extract()
    # BUG FIX: `keys.index(key)` always returned the FIRST occurrence, so a
    # duplicated label mapped every row to the same value, and a length
    # mismatch raised IndexError (silently swallowed by the bare except).
    # zip pairs label/value positionally; only the expected KeyError for
    # undeclared item fields is ignored.
    for key, value in zip(base_infos_keys, base_infos):
        try:
            item[key] = value
        except KeyError:
            # Label is not a declared LianjiaItem field — skip it.
            pass

    transactions_infos_keys = response.xpath(
        '//div[@class="transaction"]//li/span[@class="label"]//text()'
    ).extract()
    transactions_infos = response.xpath(
        '//div[@class="transaction"]//li//span[2]//text()').extract()
    for key, value in zip(transactions_infos_keys, transactions_infos):
        try:
            item[key] = value
        except KeyError:
            pass
    yield item
def parse1(self, response):
    """Parse a listings page and yield one LianjiaItem per listing.

    self.xpath1..xpath5 each select one parallel list of values
    (community names, basic info, locations, total prices, price per
    flat); rows are paired positionally.
    """
    info = Selector(response)
    community_names = info.xpath(self.xpath1).extract()
    basic_infos = info.xpath(self.xpath2).extract()
    locations = info.xpath(self.xpath3).extract()
    total_prices = info.xpath(self.xpath4).extract()
    per_flats = info.xpath(self.xpath5).extract()
    # BUG FIX: the original accumulated items into one dict {index: item}
    # and yielded that dict ONCE after the loop, so Scrapy received a
    # single malformed "item" instead of one item per listing.  zip also
    # avoids the IndexError the index-based loop raised whenever the five
    # lists differed in length.
    for name, basic, location, total, per_flat in zip(
            community_names, basic_infos, locations, total_prices,
            per_flats):
        item = LianjiaItem()
        item['community_name'] = name
        item['basic_info'] = basic
        item['location'] = location
        item['total_price'] = total
        item['per_flat'] = per_flat
        yield item
def pageData(self, response):
    """Parse one result page of for-sale listings, yielding a LianjiaItem
    for each real listing (in-list app-promo rows are skipped).

    The city name arrives via response.meta['info'].
    """
    print("=" * 50)
    city = response.meta.get("info")
    for entry in response.xpath("//ul[@class='sellListContent']/li"):
        # Rows with class 'list_app_daoliu' are app advertisements.
        if entry.xpath("@class").get() == "list_app_daoliu":
            continue
        total = entry.xpath(".//div[@class='totalPrice']/span/text()").get()
        money = str(total) + "万"
        address = entry.xpath(".//div[@class='positionInfo']/a/text()").get()
        # houseInfo is a '|'-separated summary; index 2 is deliberately
        # unused (meaning not established here — matches original code).
        fields = entry.xpath(".//div[@class='houseInfo']/text()").get().split("|")
        house_pattern = fields[0]            # room layout
        house_size = fields[1].strip()       # floor area
        house_degree = fields[3].strip()     # decoration/finish level
        house_floor = fields[4].strip()      # floor description
        price = entry.xpath(".//div[@class='unitPrice']/span/text()").get().replace("单价", "")
        time.sleep(0.5)  # throttle between rows (blocking, as in original)
        yield LianjiaItem(
            city=city,
            money=money,
            address=address,
            house_pattern=house_pattern,
            house_size=house_size,
            house_degree=house_degree,
            house_floor=house_floor,
            price=price,
        )