def parse(self, response):
    # Parse one rental-listing page.
    Houses = response.css("#list-content > div.zu-itemmod")
    for eachHouse in Houses:
        title = eachHouse.css("div.zu-info > h3 > a::text").extract()
        # The address is split between an <a> and bare text, so join both parts.
        address = eachHouse.css("div.zu-info > address > a::text").extract() \
            + eachHouse.css("div.zu-info > address::text").extract()
        detail = eachHouse.css("div.zu-info > p.details-item.tag::text").extract()
        price = eachHouse.css("div.zu-side > p > strong::text").extract()
        address = "".join(address)  # flatten the list of fragments into one string
        yield {
            'title': title,
            'address': address.replace(' ', '').replace('\n', ''),
            'detail': detail,
            'price': price,
        }
    # Look for the next-page URL and follow it.
    nextLink = response.css("div.page-content > div.multi-page > a.aNxt::attr(href)")
    if nextLink:
        nextLink = "".join(nextLink.extract())
        yield Request(nextLink, callback=self.parse, dont_filter=True)
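# Sketch (assumption: Scrapy >= 1.4): the manual "".join + Request pagination
# above can also be written with response.follow, which accepts an <a> selector
# directly and resolves relative URLs itself. Equivalent step, same CSS path:
def follow_next_page(response, callback):
    # Yields at most one request for the "next page" arrow link.
    next_link = response.css("div.page-content > div.multi-page > a.aNxt")
    if next_link:
        yield response.follow(next_link[0], callback=callback, dont_filter=True)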
def parse_newHouse(self, response):
    sel = Selector(response)
    item = AnjukeItem()
    href = response.meta.get('href')
    # House-info blocks on a new-home listing page.
    new_houses = sel.xpath('//*[@id="container"]/div[2]/div[1]/div[4]/div')
    for new_house in new_houses:
        if new_house.xpath('./div/a[1]/h3/span/text()'):
            item['id'] = new_house.xpath('./@data-link').extract()[0][-11:-5]
            item['img'] = new_house.xpath('./a/img/@src').extract()[0]
            item['title'] = new_house.xpath('./div/a[1]/h3/span/text()').extract()[0]
            item['address'] = new_house.xpath('./div/a[2]/span/text()').extract()[0] \
                .replace('[', '').replace(']', '').replace('\xa0', ' ').strip()
            if new_house.xpath('./div/a[3]/span/text()'):
                item['house_type'] = ','.join(
                    new_house.xpath('./div/a[3]/span/text()').extract())
            else:
                item['house_type'] = new_house.xpath('./div/a[3]/text()').extract()[0]
            item['status_icon'] = ','.join(
                new_house.xpath('./div/a[4]/div/i/text()').extract())
            item['tag'] = ','.join(
                new_house.xpath('./div/a[4]/div/span/text()').extract())
            item['name'] = response.meta.get('name')
            item['area_name'] = response.meta.get('area_name')
            item['type_name'] = response.meta.get('type')
            if new_house.xpath('./a[2]/p[1]/span/text()'):
                item['price'] = (
                    new_house.xpath('./a[2]/p/text()').extract()[0]
                    + new_house.xpath('./a[2]/p[1]/span/text()').extract()[0]
                    + new_house.xpath('./a[2]/p[1]/text()').extract()[1])
                if new_house.xpath('./a[2]/p[2]/text()'):
                    item['tel'] = new_house.xpath('./a[2]/p[2]/text()').extract()[0]
            else:
                item['price'] = new_house.xpath('./a[2]/p/text()').extract()[0]
                item['tel'] = new_house.xpath('./a[2]/p/text()').extract()[-1]
            yield item
    # Keep paging while the last pager link still reads "下一页" (next page).
    pager = sel.xpath('//*[@id="container"]/div[2]/div[1]'
                      '/div[@class="list-page"]/div/a/text()')
    if pager and pager.extract()[-1] == '下一页':
        yield Request(
            href[:-3] + 'p' + str(int(response.meta.get('page')) + 1) + '_' + href[-3:],
            callback=self.parse_newHouse,
            meta={
                'href': href,
                'name': response.meta.get('name'),
                'area_name': response.meta.get('area_name'),
                'type': response.meta.get('type'),
                'page': str(int(response.meta.get('page')) + 1),
            })
def parse_house_info(self, response):
    '''This function parses a sample response. Some contracts are mingled
    with this docstring (they are exercised by `scrapy check`).

    @url https://bj.zu.anjuke.com/?kw=%E8%A7%92%E9%97%A8&cw=%E8%A7%92%E9%97%A8
    @returns requests 1 100
    @scrapes title price trail
    '''
    # Gather all the house-info blocks plus the numbered pager links
    # (the "下一页" link itself is excluded; its numbered siblings suffice).
    item_list = response.xpath('//div[contains(@class,"zu-item")]')
    next_page = response.xpath(
        '/html/body/div[5]/div[3]/div[3]/div/i[@class="curr"]'
        '//following-sibling::a[not(contains(text(),"下一页"))]//@href').extract()
    for i in item_list:
        item = AnjukeItem()
        item['title'] = i.xpath(
            'div[@class="zu-info"]//a[1]//@title')[0].extract().strip()
        item['house_detail_url'] = i.xpath(
            'div[@class="zu-info"]//a[1]/@href')[0].extract().strip()
        base = i.xpath('div[@class="zu-info"]//p/text()[1]')[0].extract().strip()
        square = i.xpath('div[@class="zu-info"]//p/text()[2]')[0].extract().strip()
        floor = i.xpath('div[@class="zu-info"]//p/text()[3]')[0].extract().strip()
        contract = i.xpath('div[@class="zu-info"]//p/text()[4]')[0].extract().strip()
        direction = i.xpath('div[@class="zu-info"]//p[2]/span[2]')[0].extract().strip()
        try:
            trail = i.xpath('div[@class="zu-info"]//p[2]/span[3]')[0].extract().strip()
        except IndexError:
            trail = "None"
        price = i.xpath('div[@class="zu-side"]//p/strong//text()')[0].extract().strip()
        current_page = response.xpath(
            '/html/body/div[5]/div[3]/div[3]/div/i[@class="curr"]//text()').extract()
        item['base'] = base
        item['square'] = square
        item['floor'] = floor
        item['contract'] = contract
        item['direction'] = direction
        item['trail'] = trail
        item['price'] = price
        item['current_page'] = current_page
        yield scrapy.Request(url=item['house_detail_url'],
                             dont_filter=True,
                             meta={'item': item},
                             callback=self.parse_item)
    # Crawl the next pages, deduplicating against already-visited URLs.
    for page in next_page:
        if page not in self.visited_set:
            self.visited_set.add(page)
            yield scrapy.Request(url=page,
                                 callback=self.parse_house_info,
                                 dont_filter=True)
def parse2(self, response):
    selector = Selector(response)
    item = AnjukeItem()
    try:
        item['name'] = selector.xpath(
            "//div[@class='basic-info']/h1/text()").extract()[0]
    except Exception as e:
        item['name'] = ""
        print(e)
def parse_detail(self, response):
    house_info = response.xpath('//*[@class="houseInfo-wrap"]')
    if house_info:
        loader = ItemLoader(AnjukeItem(), house_info)
        loader.add_xpath('mode', '//div/div[2]/dl[1]/dd/text()')
        loader.add_xpath('area', '//div/div[2]/dl[2]/dd/text()')
        loader.add_xpath('floor', '//div/div[2]/dl[4]/dd/text()')
        loader.add_xpath('age', '//div/div[1]/dl[3]/dd/text()')
        loader.add_xpath('price', '//div/div[3]/dl[2]/dd/text()')
        loader.add_xpath('location', '//div/div[1]/dl[1]/dd/a/text()')
        loader.add_xpath('district', '//div/div[1]/dl[2]/dd/p/a[1]/text()')
        yield loader.load_item()
def parse_dir_contents(self, response):
    item = AnjukeItem()
    # The broker name precedes "的" ("'s") in the headline, e.g. "张三的出租房".
    str_nam = response.xpath(
        '//div[@class="firstline clearfix"]//a/text()').extract_first().strip()
    item['name'] = str_nam.split("的")[0]
    # Phone numbers are embedded in the <title>; match any 11-digit mobile number.
    str_phn = str(response.xpath('//title/text()').extract())
    item['phone'] = re.findall(r"1\d{10}", str_phn)
    yield item
def info(self, response):
    item = AnjukeItem()
    selector = scrapy.Selector(response)
    community = selector.xpath(
        '//*[@id="content"]/div[2]/div/div/h3/text()').extract()[0]
    average_price = selector.xpath(
        '//*[@id="content"]/div[2]/div/div/p/span/em/text()').extract()[0]
    item['community'] = community
    item['average_price'] = average_price
    print(item['community'], item['average_price'])
    print(type(item['community']))
    print(item)
    print(type(item))
    yield item
def parse_item(self, response):
    itemloader = AnjukeItemLoader(item=AnjukeItem(), response=response)
    itemloader.add_xpath('title', '//div[@class="lp-tit"]/h1/text()')
    itemloader.add_xpath('price', '//dd[contains(@class, "price")]/p/em')
    itemloader.add_xpath('around_price', '//dd[@class="around-price"]/span/text()')
    itemloader.add_xpath('house_type',
                         '//dd[@class="ajust"]/div[@class="house-item"]/a/text()')
    itemloader.add_xpath('address', '//span[@class="lpAddr-text"]/text()')
    itemloader.add_xpath('phone', '//div[contains(@class, "tel-box")]/p/strong/text()')
    # Raw strings keep the regex escapes (\s) intact.
    itemloader.add_xpath('opentime', '//p[contains(@class, "info-new")]',
                         re=r'<label>最新开盘</label>\s+(.*)<a.*')
    itemloader.add_xpath('completetime', '//p[contains(@class, "info-new")]',
                         re=r'<label>交房时间</label>\s+(.*)</p>.*')
    itemloader.add_value('url', response.url)
    yield itemloader.load_item()
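# The loaders above rely on a custom AnjukeItemLoader defined elsewhere in the
# project. A minimal sketch of what such a loader usually looks like; the
# processor choices are an assumption, not the project's actual definition.
# (On Scrapy >= 2.0 these processors also live in itemloaders.processors.)
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst

class AnjukeItemLoaderSketch(ItemLoader):
    # Strip whitespace on the way in; collapse each field's list to its first
    # value on the way out, so item fields end up as plain strings.
    default_input_processor = MapCompose(str.strip)
    default_output_processor = TakeFirst()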
def parse(self, response):
    # Grab every listing block from the page.
    divs = response.xpath('//li[@class="list-item"]')
    for div in divs:
        item = AnjukeItem()
        # Community name and address share one title attribute, separated by two
        # non-breaking spaces, e.g. "大华锦绣华城\xa0\xa0宝山-大华-真北路4333弄".
        address = div.xpath('.//span[@class="comm-address"]/@title').extract_first()
        address1 = address[address.index("\xa0\xa0") + 2:]   # "宝山-大华-真北路4333弄"
        address2 = address1[address1.index("-") + 1:]        # "大华-真北路4333弄"
        address3 = address2[address2.index("-") + 1:]        # "真北路4333弄"
        name1 = address[:address.index("\xa0\xa0")]          # community name
        # Defaults keep the assignments below from raising NameError
        # when a listing lacks the optional fields.
        type_1 = ''
        area1 = ''
        try:
            # Layout, e.g. "2室1厅".
            type_1 = div.xpath(
                './/div[@class="details-item"]/span/text()').extract_first()
        except Exception:
            pass
        # Total price.
        price = div.xpath('.//span[@class="price-det"]/strong/text()').extract_first()
        price1 = price + '万'
        try:
            area = div.xpath(
                './/div[@class="details-item"]/span/text()').extract()[1:2]
            area1 = ''.join(area)  # collapse the one-element list into a string
        except Exception:
            pass
        item['address'] = address3
        item['name'] = name1
        item['type_'] = type_1
        item['price'] = price1
        item['area'] = area1
        yield item
    # Queue the next page, if there is one.
    next_ = response.xpath(
        '//div[@class="multi-page"]/a[@class="aNxt"]/@href').extract_first()
    print('-------next----------')
    print(next_)
    if next_:
        yield response.follow(url=next_, callback=self.parse)
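# Sketch: the index()-based slicing above raises ValueError whenever a
# separator is missing. A defensive helper doing the same split (the name
# split_address is ours, not the spider's):
def split_address(address):
    """Split '<name>\xa0\xa0<district>-<area>-<street>' into (name, street)."""
    name, sep, rest = address.partition('\xa0\xa0')
    if not sep:
        return address, address  # no separator: fall back to the raw string
    return name, rest.split('-', 2)[-1]

# e.g. split_address('大华锦绣华城\xa0\xa0宝山-大华-真北路4333弄')
# returns ('大华锦绣华城', '真北路4333弄')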
def parse_item(self, response):
    for each in response.xpath('//div[@class="zu-info"]'):
        item = AnjukeItem()
        # string(.) concatenates the nested text nodes of the <h3> title.
        titleold = each.xpath('h3').xpath("string(.)").extract()[0]
        title = self.zhuanma(titleold)
        link = each.xpath('h3/a/@href').extract()[0]
        sizeold = each.xpath('p[@class="details-item tag"]/b/text()').extract()[0]
        size = self.zhuanma(sizeold)
        # zhuanma normalizes the raw text (e.g. the '\xa0' non-breaking spaces).
        item['title'] = title.strip()
        item['link'] = link.strip()
        item['size'] = size.strip()
        yield item
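# zhuanma (转码, "transcode") is defined elsewhere in this spider. A purely
# hypothetical reconstruction, inferred from how its results are stripped:
def zhuanma(self, text):
    # Assumption: replace the '\xa0' non-breaking spaces Anjuke embeds and
    # collapse runs of whitespace to single spaces.
    return ' '.join(text.replace('\xa0', ' ').split())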
def detail_parse(self, response):
    # Use the decoded text so the regexes below run on str, not bytes.
    content = response.text
    area_url = response.meta['area_url']
    page = response.meta['page']
    city = response.meta['city']
    area = response.meta['area']
    if 'antispam' in response.url:
        # Redirected to the anti-spam page: retry the same listing page.
        url = area_url + 'p{}/'.format(page)
        yield scrapy.Request(url,
                             headers=header,
                             callback=self.detail_parse,
                             meta={'city': city, 'area': area,
                                   'page': page, 'area_url': area_url},
                             dont_filter=True)
    else:
        house_info = re.findall(r'<div class="zu-info">([\s\S]*?)</div>', content)
        for info in house_info[:-1]:
            pattern = re.search(r'title="([\s\S]*?)"[\s\S]*?href="(.*?)"', info)
            # A fresh item per listing; reusing one instance would let later
            # assignments leak into items already yielded.
            items = AnjukeItem()
            items['city'] = city
            items['area'] = area
            items['title'] = pattern.group(1)
            items['link'] = pattern.group(2)
            yield items
        # 'aNxt' marks the next-page button; keep paging while it is present.
        if 'aNxt' in content:
            page += 1
            url = area_url + 'p{}/'.format(page)
            yield scrapy.Request(url,
                                 headers=header,
                                 callback=self.detail_parse,
                                 meta={'city': city, 'area': area,
                                       'page': page, 'area_url': area_url},
                                 dont_filter=True)
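# Sketch: detail_parse handles the anti-spam redirect inline. An alternative is
# a downloader middleware that re-queues any request landing on an "antispam"
# URL, keeping callbacks free of retry plumbing. The class name and retry cap
# are our choices, not part of the project:
class AntispamRetryMiddleware(object):
    MAX_RETRIES = 3

    def process_response(self, request, response, spider):
        if 'antispam' in response.url:
            retries = request.meta.get('antispam_retries', 0)
            if retries < self.MAX_RETRIES:
                # Returning a Request makes Scrapy reschedule it instead of
                # passing the blocked response to the spider.
                retry = request.replace(dont_filter=True)
                retry.meta['antispam_retries'] = retries + 1
                return retry
        return response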
def parse(self, response):
    sel = Selector(response)
    item = AnjukeItem()
    item = self._item_init(item)
    try:
        fang_info = {'title': '', 'info': '', 'desc': '', 'pic_tab': ''}
        item['url'] = response.url
        item['fang_id'] = re.search(r'\d+_\d+', item['url']).group(0)
        # The page is GBK-encoded; re-decode the body for storage.
        item['body'] = response.body.decode('gbk')
        try:
            fang_info['title'] = sel.xpath(
                '//div[@class="mainBoxL"]/div[@class="title"]').extract()[0]
        except Exception as e:
            print(Exception, ":", e)
        try:
            fang_info['info'] = sel.xpath(
                '//div[@class="houseInfor clearfix"]/div[@class="inforTxt"]'
            ).extract()[0]
        except Exception as e:
            print(Exception, ":", e)
        try:
            fang_info['desc'] = sel.xpath(
                '//div[@id="hsPro-pos"]/div[@class="describe mt10"]').extract()[0]
        except Exception as e:
            print(Exception, ":", e)
        try:
            fang_info['pic_tab'] = sel.xpath('//div[@id="hsPic-pos"]').extract()[0]
        except Exception as e:
            print(Exception, ":", e)
        # Fingerprint the extracted fields so changed pages can be detected.
        m = hashlib.md5()
        m.update(str(fang_info).encode('utf-8'))
        follow_value = m.hexdigest()
        yield item
    except Exception as e:
        print(Exception, ":", e)
def parse_item(self, response):
    item_loader = AnjukeItemLoader(item=AnjukeItem(), response=response)
    item_loader.add_xpath("title", "//h3[@class='long-title']/text()")
    item_loader.add_xpath("size", "//span[@class='info-tag'][2]/em/text()")
    item_loader.add_xpath("total_price", "//span[@class='light info-tag']/em/text()")
    item_loader.add_xpath("locate", "//div[@class='houseInfo-content']/p/a[1]/text()")
    meter_price = response.xpath(
        "//div[@class='houseInfo-content']")[2].xpath("text()").extract_first("")
    item_loader.add_value("meter_price", meter_price)
    crawl_time = datetime.now().strftime("%Y-%m-%d %H:%M")
    item_loader.add_value("crawl_time", crawl_time)
    anjuke_item = item_loader.load_item()
    yield anjuke_item
def parse(self, response):
    selector = Selector(response)
    # Note the trailing space in the class name; it is present in the page source.
    infos = selector.xpath('//div[@class="zu-itemmod "]')
    for info in infos:
        # A fresh item per listing so earlier yields are not mutated later.
        item = AnjukeItem()
        item['url'] = info.xpath('a/@href').extract()
        item['price'] = info.xpath('div[2]/p/strong/text()').extract()
        item['roomType'] = info.xpath('div[1]/p[1]/text()[1]').extract()
        item['rentType'] = info.xpath('div[1]/p[1]/text()[2]').extract()
        item['decoration'] = info.xpath('div[1]/p[1]/text()[3]').extract()
        item['floor'] = info.xpath('div[1]/p[1]/text()[4]').extract()
        area = info.xpath('div[1]/address/a/text()').extract()
        if area:
            item['area'] = area[0].strip()
        address = info.xpath('div[1]/address/text()').extract()
        if len(address) > 1:
            item['address'] = address[1].strip()
        item['title'] = info.xpath('div[1]/h3/a/text()').extract()
        yield item
    # Pages 2-13 of the huacao area listing are hard-coded.
    for i in range(2, 14):
        nexturl = 'http://sh.zu.anjuke.com/fangyuan/huacao/fx3-p%s/' % i
        yield Request(nexturl, callback=self.parse)
def parse(self, response):
    selector = Selector(response)
    house_data = selector.xpath('//*[@id="list-content"]/div')
    # div[1] and div[2] are not listings and must be discarded.
    for eachhouse in house_data[3:]:
        item = AnjukeItem()
        house_type = eachhouse.xpath('div[1]/p[1]/text()[1]').extract()
        rent_type = eachhouse.xpath('div[1]/p[1]/text()[2]').extract()
        renovation = eachhouse.xpath('div[1]/p[1]/text()[3]').extract()
        address = eachhouse.xpath('div[1]/address/text()').extract()
        owner = eachhouse.xpath('div[1]/p[2]/span/text()').extract()
        # Relative path: 'div[2]/...', not '/div[2]/...'; an absolute path
        # would search from the document root (a trap the original hit).
        price = eachhouse.xpath('div[2]/p/strong/text()').extract()
        item['house_type'] = house_type if house_type else None
        item['rent_type'] = rent_type if rent_type else None
        item['renovation'] = renovation if renovation else None
        item['address'] = address if address else None
        item['owner'] = owner if owner else None
        item['price'] = price if price else None
        yield item
    # The pager hrefs are unordered; take the last one as the next page.
    pagelinks = selector.xpath('//div[@class="multi-page"]/a/@href').extract()
    if pagelinks:
        nextpage = pagelinks[-1]
        print(nextpage)
        yield Request(nextpage, callback=self.parse)
def parse_detail(self, response): print("正在下载详情页", response.url) html = response.body.decode("utf-8", 'ignore') info = response.meta['info1'] res = re.search(r'.*area.*?(\[.*?\]).*', html).group(1) if res: res_list = eval(res) history = [] for item in res_list: for _, v in item.items(): history.append(v) info['history'] = '|'.join(history) from items import AnjukeItem item = AnjukeItem() for k in info.keys(): item[k] = info[k] yield item
def parse(self, response):
    sel = Selector(response)
    item = AnjukeItem()
    item = self._item_init(item)
    # The property attributes live in <dl><dt>label</dt><dd>value</dd> rows;
    # this template selects the <dl> row for a given <dt> label.
    dl_row = '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[1]/div/dl[dt="%s"]'
    try:
        house_info = sel.xpath(
            '//h4[@class="block-title houseInfo-title"]/span/text()').extract()[0]
        item['anjuke_id'] = re.search(r"\d{9,}", house_info).group(0)
        item['deploy_time'] = re.search(r"\d{4}年\d{2}月\d{2}日", house_info).group(0)
    except Exception as e:
        print(Exception, ":", e)
    try:
        item['Cur_url'] = response.url
    except Exception as e:
        print(Exception, ":", e)
    try:
        item['City'] = sel.xpath(
            '//*[@id="content"]/div[1]/a[2]/text()').extract()[0].replace('二手房', '')
    except Exception as e:
        print(Exception, ":", e)
    try:
        item['District'] = sel.xpath(
            '//*[@id="content"]/div[1]/a[3]/text()').extract()[0].replace('二手房', '')
    except Exception as e:
        print(Exception, ":", e)
    try:
        item['Block'] = sel.xpath(
            '//*[@id="content"]/div[1]/a[4]/text()').extract()[0].replace('二手房', '')
    except Exception as e:
        print(Exception, ":", e)
    try:
        item['Estate'] = sel.xpath(
            '//*[@id="content"]/div[1]/a[4]/text()').extract()[0]
    except Exception as e:
        print(Exception, ":", e)
    try:
        item['Title'] = sel.xpath(
            '//*[@id="content"]/div[@class="wrapper"]/h3[@class="long-title"]/text()'
        ).extract()[0]
    except Exception as e:
        print(Exception, ":", e)
    try:
        # Strip the surrounding markup with a tag-removing regex.
        item['Price'] = re.compile(r'<[^>]+>', re.S).sub(
            '', sel.xpath(
                '//*[@id="content"]/div[2]/div[1]/div[1]/span[1]').extract()[0])
    except Exception as e:
        print(Exception, ":", e)
    try:
        item['Layout'] = sel.xpath(dl_row % '房型:' + '/dd/text()').extract()[0] \
            .replace('\n', '').replace('\t', '')
    except Exception as e:
        print(Exception, ":", e)
    try:
        item['Decoration'] = sel.xpath(dl_row % '装修程度:' + '/dd/text()').extract()[0]
    except Exception as e:
        print(Exception, ":", e)
    try:
        item['Location'] = re.compile(r'<[^>]+>', re.S).sub(
            '', sel.xpath(dl_row % '位置:' + '/dd/p').extract()[0]) \
            .replace('\n', '').replace('\t', '')
    except Exception as e:
        print(Exception, ":", e)
    try:
        item['Area'] = sel.xpath(dl_row % '面积:' + '/dd/text()').extract()[0]
    except Exception as e:
        print(Exception, ":", e)
    try:
        item['Unit_Price'] = sel.xpath(dl_row % '房屋单价:' + '/dd/text()').extract()[0]
    except Exception as e:
        print(Exception, ":", e)
    try:
        item['Years'] = sel.xpath(dl_row % '年代:' + '/dd/text()').extract()[0]
    except Exception as e:
        print(Exception, ":", e)
    try:
        item['Orientation'] = sel.xpath(dl_row % '朝向:' + '/dd/text()').extract()[0]
    except Exception as e:
        print(Exception, ":", e)
    try:
        item['Downpayment'] = sel.xpath(
            dl_row % '参考首付:' + '/dd/text()').extract()[0] \
            .replace('\n', '').replace('\t', '')
    except Exception as e:
        print(Exception, ":", e)
    try:
        item['Type'] = sel.xpath(dl_row % '类型:' + '/dd/text()').extract()[0]
    except Exception as e:
        print(Exception, ":", e)
    try:
        item['Floor'] = sel.xpath(dl_row % '楼层:' + '/dd/text()').extract()[0]
    except Exception as e:
        print(Exception, ":", e)
    try:
        item['Monthly_Payments'] = sel.xpath(
            dl_row % '参考月供:' + '/dd/span/text()').extract()[0]
    except Exception as e:
        print(Exception, ":", e)
    try:
        item['Desc'] = re.compile(r'<[^>]+>', re.S).sub(
            '', sel.xpath(
                '//*[@id="content"]/div[2]/div[1]/div[3]/div/div/div[3]/div/div'
            ).extract()[0])
    except Exception as e:
        print(Exception, ":", e)
    try:
        item['Agent'] = sel.xpath('//p[@class="broker-name"]/a/text()').extract()[0]
    except Exception as e:
        # The original dropped the exception object here; print it like the rest.
        print(Exception, ":", e)
    try:
        item['Agent_Phone'] = sel.xpath(
            '//p[@class="broker-mobile"]/text()').extract()[0].replace(' ', '')
    except Exception as e:
        print(Exception, ":", e)
    try:
        item['Agent_Company'] = sel.xpath(
            '//div[@class="broker-company"]/a[1]/text()').extract()[0]
    except Exception as e:
        print(Exception, ":", e)
    yield item
def parse(self, response): print("开始解析第%s页 >>> " % self.page_index) crt_url = response.url print("当前url: {url}".format(url=crt_url)) pnum = extract_page_index(crt_url) print(pnum) cache_crt_page_index(pnum) item = AnjukeItem() print('------------------------------------------------') info_list = response.xpath("//*[@id='houselist-mod-new']/li") for info in info_list: # 标题 title = info.xpath("./div[2]/div[1]/a/text()").extract_first() # 安选验真信息 guarantee_info = info.xpath( "./div[2]/div[1]/em/@title").extract_first() # 链接 link = info.xpath("./div[2]/div[1]/a/@href").extract_first() # 房屋id house_id = extract_house_id(link) # 户型 house_type = info.xpath( "./div[2]/div[2]/span[1]/text()").extract_first() # 面积 area = info.xpath("./div[2]/div[2]/span[2]/text()").extract_first() # 楼层信息 floor_info = info.xpath( "./div[2]/div[2]/span[3]/text()").extract_first() # 建造时间 build_time_info = info.xpath( "./div[2]/div[2]/span[4]/text()").extract_first() # 经纪人姓名 broker_name = info.xpath( "./div[2]/div[2]/span[5]/text()").extract_first() # 地址 address = info.xpath("./div[2]/div[3]/span/text()").extract_first() # 标签信息 tags = [] for tag in info.xpath("./div[2]/div[4]"): tag_str = tag.xpath("./span/text()").extract() tags.extend(tag_str) # 价格 price = info.xpath( "./div[3]/span[1]/strong/text()").extract_first() # 每平米价格 unit_price = info.xpath("./div[3]/span[2]/text()").extract_first() # 赋值到item对象上------------ item['house_id'] = house_id item['title'] = title.strip() if title else '' item['guarantee_info'] = guarantee_info if guarantee_info else '' item['link'] = link if link else '' item['house_type'] = house_type if house_type else '' item['area'] = area if area else '' item['floor_info'] = floor_info if floor_info else '' item[ 'build_time_info'] = build_time_info if build_time_info else '' item['broker_name'] = broker_name if broker_name else '' item['address'] = address.strip() if address else '' item['tags'] = tags if tags else [] item['price'] = price if price else '' item['unit_price'] = unit_price if unit_price else '' yield item # 下一页地址 next_page_url = response.xpath( "//*[@id='content']/div[4]/div[7]/a[@class='aNxt']/@href" ).extract_first() print(next_page_url) if next_page_url is not None: yield scrapy.Request(response.urljoin(next_page_url)) self.page_index += 1
def parse_item(self, response):
    meta = response.meta
    if 'callback' not in response.url and response.status == 200:
        li = response.xpath('//ul[@id="houselist-mod-new"]/li')
        if li:
            for i in li:
                item = AnjukeItem()
                item['area'] = meta.get('area')
                item['location'] = meta.get('location')
                item['city_name'] = meta.get('city_name')
                title = i.xpath(
                    './div[@class="house-details"]/div/a/@title').extract()
                base_url = i.xpath(
                    './div[@class="house-details"]/div/a/@href').extract()
                item['base_url'] = ''.join(base_url) if base_url else ''
                item['title'] = ''.join(title) if title else ''
                addr = i.xpath(
                    './div[@class="house-details"]/div[@class="details-item"]'
                    '/span[@class="comm-address"]/@title').extract()
                item['addr'] = ''.join(addr) if addr else ''
                sum_price = i.xpath(
                    './div[@class="pro-price"]/span[@class="price-det"]'
                    '/strong/text()').extract()
                item['sum_price'] = ''.join(sum_price) if sum_price else ''
                unit_price = i.xpath(
                    './div[@class="pro-price"]/span[@class="unit-price"]'
                    '/text()').extract()
                item['unit_price'] = ''.join(unit_price) if unit_price else ''
                item['url'] = response.url
                item['dt'] = dt
                yield item
        # Follow pagination, carrying the location metadata forward.
        next_url = response.xpath(
            '//div[@class="multi-page"]/a[@class="aNxt"]/@href').extract()
        if next_url:
            url = ''.join(next_url)
            yield scrapy.Request(url,
                                 meta={'url': url,
                                       'city_name': meta.get('city_name'),
                                       'area': meta.get('area'),
                                       'location': meta.get('location')},
                                 callback=self.parse_item,
                                 dont_filter=True)
    else:
        # Bounced (JS callback page or non-200 status): retry the original URL.
        url = meta.get('url')
        if url:
            yield scrapy.Request(url,
                                 meta={'url': url,
                                       'city_name': meta.get('city_name'),
                                       'area': meta.get('area'),
                                       'location': meta.get('location')},
                                 callback=self.parse_item,
                                 dont_filter=True)
def parse(self, response):
    item = AnjukeItem()
    item = self._item_init(item)
    sel = Selector(response)
    item['batch_id'] = batch_id = response.meta['batch_id']
    item['submit_time'] = response.meta['submit_time']
    item['schedule_time'] = time.strftime(
        "%Y-%m-%d %H:%M:%S", time.localtime(response.meta['schedule_time']))
    item['received_time'] = time.strftime(
        "%Y-%m-%d %H:%M:%S", time.localtime(response.meta['received_time']))
    item['page_index'] = page_index = response.meta['page_index']
    # response.headers values are bytes; decode before strptime. The Date
    # header is GMT, hence the 8-hour shift to Beijing time.
    server_time = time.mktime(time.strptime(
        response.headers['Date'].decode('utf-8'),
        "%a, %d %b %Y %H:%M:%S %Z")) + 8 * 3600
    item['server_time'] = time.strftime(
        "%Y-%m-%d %H:%M:%S", time.localtime(server_time))

    def to_update_time(update_tag):
        """Translate a 'N秒/分钟/小时/天前更新' tag into an absolute timestamp."""
        units = {'秒': 1, '分钟': 60, '小时': 3600, '天': 86400}
        m = re.match(r'(\d+)(秒|分钟|小时|天)前更新', update_tag)
        if not m:
            return None
        deviation = int(m.group(1)) * units[m.group(2)]
        return time.strftime("%Y-%m-%d %H:%M:%S",
                             time.localtime(server_time - deviation))

    try:
        if sel.xpath('//div[@class="list sorry_word"]'):
            # Hit the "sorry" (blocked/empty) page: retry up to two more times,
            # passing the raw meta values through so the retried response can
            # re-format them.
            retry_count = int(response.meta.get('retry_count', 0))
            if retry_count <= 2:
                print("retry......")
                yield Request(url=response.url, method='GET', callback=self.parse,
                              meta={'submit_time': response.meta['submit_time'],
                                    'schedule_time': response.meta['schedule_time'],
                                    'received_time': response.meta['received_time'],
                                    'retry_count': retry_count + 1,
                                    'page_index': page_index,
                                    'batch_id': batch_id})
            else:
                return
        dl_list = sel.xpath('//div[@class="houseList"]/dl[@class="list rel"]')
        # Pages with more than 30 entries carry an extra leading <dl>; skip it
        # and keep ranks counted at 30 per page either way.
        start = 1 if len(dl_list) > 30 else 0
        for dl_index in range(start, len(dl_list)):
            try:
                item['fang_id'] = re.search(
                    r'\d_\d+',
                    dl_list[dl_index].xpath(
                        './dd[@class="info rel floatr"]/p[@class="title"]/a/@href'
                    ).extract()[0]).group(0)
                item['rank'] = 30 * (page_index - 1) + dl_index + (1 - start)
                item['update_tag'] = update_tag = dl_list[dl_index].xpath(
                    './dd[@class="info rel floatr"]/p[@class="gray6 mt10"]'
                    '/span[@class="ml10 gray9"]/text()').extract()[0]
                update_time = to_update_time(update_tag)
                if update_time:
                    item['update_time'] = update_time
            except Exception as e:
                print(Exception, ":", e)
            yield item
    except Exception as e:
        print(Exception, ":", e)
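# Worked example: the server_time computation above parses the HTTP Date header
# (always GMT) and shifts it 8 hours to Beijing time. The header value below is
# hypothetical. Caveat: time.mktime interprets the parsed tuple as *local*
# time, so the arithmetic is exact only on a UTC-configured host.
import time

date_header = 'Wed, 21 Oct 2015 07:28:00 GMT'
epoch = time.mktime(time.strptime(date_header, '%a, %d %b %Y %H:%M:%S %Z'))
beijing = epoch + 8 * 3600  # GMT+8
print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(beijing)))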