def parse_item(self, response):
    cityName = response.xpath('//div[@id="curCity"]/span[1]/text()').extract_first()
    cityName = normCityAreaName(cityName)
    if not cityName or not judgeCityName(self.conn.cursor(), cityName):
        return
    cityAreaName = response.xpath('//div[@class="newhouse_details fl "]/ul/li[3]/text()').extract_first('')
    cityAreaName = normCityAreaName(cityAreaName.split(' ')[0])
    if not cityAreaName or not judgeAreaName(self.conn.cursor(), cityAreaName):
        return
    communityName = response.xpath('//div[@class="house_title"]/h1/text()').extract_first()
    communityCoverUrl = response.xpath('//div[@class="album_detail fl"]/a/img/@src').extract_first()
    if not communityName or not communityCoverUrl:
        return
    communityAreaData = judgeNewCommunity(self.conn.cursor(), communityName, cityAreaName, cityName,
                                          communityHomeUrl=response.url)
    if not communityAreaData:
        return
    kaipanTime = response.xpath('//div[@class="discription_more p_15"]/p[3]/text()').extract_first()
    kaipanTime = kaipanTimeParse(kaipanTime)
    if kaipanTime is False:
        return
    dizhi = response.xpath('//div[@class="discription_more p_15"]/p[last()]/em/text()').extract_first('').strip()
    wuye = response.xpath('//div[@class="discription_more p_15"]/p[last()-1]/em/text()').extract_first('').strip()
    kaifashang = response.xpath('//div[@class="discription_more p_15"]/p[5]/text()').extract_first('').strip()
    jieshao = response.xpath('//div[@id="lightspot"]/@content').extract_first('').strip()
    items = SpiderssetItem()
    items['communityAreaData'] = communityAreaData
    # The cover URL can contain \n, \r, etc., hence the strip(); also normalize the thumbnail size to 600x450.
    items['communityCoverUrl'] = re.sub(r'\d{3}x\d{3}', '600x450', communityCoverUrl).strip()
    items['comNumber'] = self.comNumber
    items['spidersName'] = self.name
    items['communityName'] = communityName
    items['cityName'] = cityName
    items['cityAreaName'] = cityAreaName
    items['kaipanTime'] = kaipanTime
    items['dizhi'] = dizhi
    items['wuye'] = wuye
    items['kaifashang'] = kaifashang
    items['jieshao'] = jieshao
    items['communityHomeUrl'] = response.url
    communityHouseTypeUrl = response.xpath('//li[@id="noCur"]/a/@href').extract_first()
    if communityHouseTypeUrl:
        communityHouseTypeUrl = response.urljoin(communityHouseTypeUrl)
        items['communityHouseTypeUrl'] = communityHouseTypeUrl
        items['houseImgUrl_list'] = []
        items['houseType_list'] = []
        items['houseArea_list'] = []
        yield Request(communityHouseTypeUrl, callback=self.communityHouseTypeParse, meta={'items': items})
    else:
        yield items

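# For reference, a minimal sketch of the communityHouseTypeParse callback that
# parse_item schedules above. This method is not part of the original source:
# the XPaths below are placeholder assumptions and would have to be matched to
# the real floor-plan page. The only things taken from the source are the item
# fields parse_item initializes (houseImgUrl_list, houseType_list, houseArea_list).
def communityHouseTypeParse(self, response):
    items = response.meta['items']
    for li in response.xpath('//div[@class="house_type_list"]/ul/li'):  # hypothetical container XPath
        imgUrl = li.xpath('./a/img/@src').extract_first()
        houseType = li.xpath('./p[1]/text()').extract_first('').strip()  # e.g. 3室2厅2卫
        houseArea = li.xpath('./p[2]/text()').extract_first('').strip()  # e.g. 128平米
        if not imgUrl:
            continue
        items['houseImgUrl_list'].append(response.urljoin(imgUrl))
        items['houseType_list'].append(houseType)
        items['houseArea_list'].append(houseArea)
    yield items  # emit the community item once the floor-plan lists are filled
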
def parse(self, response):
    if not isinstance(self.comNumber, int):
        raise Exception('---------- site number comNumber is not set ----------------- ')
    self.conn = connectDataBase()
    if not createSaveHouseImgPath(self.name):  # directory creation failed -> stop the spider
        return
    # Parse every city on this site together with its start URL
    cityName_list = response.xpath('//div[@class="leter_list clearfix"]/ul/li/a/text()').extract()
    cityStartUrl_list = response.xpath('//div[@class="leter_list clearfix"]/ul/li/a/@href').extract()
    if len(cityName_list) != len(cityStartUrl_list):
        print 'len(cityName_list)!=len(cityStartUrl_list)'
        return
    for i, cityName in enumerate(cityName_list):
        cityName = normCityAreaName(cityName)
        if not judgeCityName(self.conn.cursor(), cityName):  # is this city in the database?
            continue
        items = SpiderssetItem()
        items['cityName'] = cityName
        items['spidersName'] = self.name
        items['comNumber'] = self.comNumber
        yield Request(cityStartUrl_list[i], callback=self.communityListUrlParse, meta={'items': items})

def parse(self, response):
    if not isinstance(self.comNumber, int):
        raise Exception('---------- site number comNumber is not set ----------------- ')
    self.conn = connectDataBase(self.name)
    if not createSaveHouseImgPath(self.name):  # directory creation failed -> stop the spider
        return
    # Parse every city on this site together with its start URL
    cityName_list = response.xpath('//div[@class="sm_station_hot"]/a/text()').extract()
    cityStartUrl_list = response.xpath('//div[@class="sm_station_hot"]/a/@href').extract()
    if len(cityName_list) != len(cityStartUrl_list):
        print 'len(cityName_list)!=len(cityStartUrl_list)'
        return
    N = len(cityName_list)
    for i in range(N):
        cityName = normCityAreaName(cityName_list[i])
        if not judgeCityName(self.conn.cursor(), cityName):  # is this city in the database? (check the normalized name)
            continue
        items = SpiderssetItem()
        items['cityName'] = cityName
        # e.g. http://nj.house365.com/index.html -> http://newhouse.nj.house365.com/house/
        cityStartUrl = cityStartUrl_list[i].replace('://', '://newhouse.').replace('index.html', 'house/')
        items['cityStartUrl'] = cityStartUrl
        yield Request(cityStartUrl, callback=self.cityAreaParse, meta={'items': items})

def parse(self, response):
    if not isinstance(self.comNumber, int):
        raise Exception('---------- the comNumber (site number) parameter is not set ----------------- ')
    self.conn = connectDataBase()
    if not createSaveHouseImgPath(self.name):  # directory creation failed -> stop the spider
        return
    # Parse every city on this site together with its start URL
    cityName_list = response.xpath('//div[@class="topnav-sub city"][1]/a/text()').extract()
    cityStartUrl_list = response.xpath('//div[@class="topnav-sub city"][1]/a/@href').extract()
    # cityStartUrl_list = filter(lambda x: 'xkhouse' in x, cityStartUrl_list)
    if len(cityName_list) != len(cityStartUrl_list):
        print 'len(cityName_list)!=len(cityStartUrl_list)'
        return
    N = len(cityName_list)
    for i in range(N):
        cityName = normCityAreaName(cityName_list[i])
        if not judgeCityName(self.conn.cursor(), cityName):  # is this city in the database? (check the normalized name)
            continue
        items = SpiderssetItem()
        items['cityName'] = cityName
        cityStartUrl = cityStartUrl_list[i].replace('://', '://newhouse.') + 'loupan/'
        items['cityStartUrl'] = cityStartUrl
        yield Request(cityStartUrl, callback=self.cityAreaParse, meta={'items': items})

def cityAreaParse(self, response):
    # Parse every district of one city together with its URL
    cityAreaName_list = response.xpath('//div[@id="dict_key"]/dl[1]/dd/ul[1]/li/a/text()').extract()
    cityAreaStartUrl_list = response.xpath('//div[@id="dict_key"]/dl[1]/dd/ul[1]/li/a/@href').extract()
    if len(cityAreaName_list) != len(cityAreaStartUrl_list):
        return
    base_items = response.meta["items"]
    for i, cityAreaName in enumerate(cityAreaName_list):
        cityAreaName = normCityAreaName(cityAreaName)
        if not judgeAreaName(self.conn.cursor(), cityAreaName):
            continue
        # Copy the item per district: the scheduled requests run concurrently, so
        # sharing one mutable item would make every callback see the last district's values.
        items = base_items.copy()
        items['cityAreaName'] = cityAreaName  # e.g. 松江区
        cityAreaStartUrl = cityAreaStartUrl_list[i]
        items['cityAreaStartUrl'] = cityAreaStartUrl  # e.g. http://sh.jiwu.com/loupan/list-qa14137.html
        yield Request(cityAreaStartUrl, callback=self.communityParse, meta={'items': items})

def cityAreaParse(self, response):
    # Parse every district of one city together with its URL
    cityAreaName_list = response.xpath('//a[@class="clickStatistics_xfsxqs"]/text()').extract()  # [1:]  the first entry is "全部" (All)
    cityAreaStartUrl_list = response.xpath('//a[@class="clickStatistics_xfsxqs"]/@data-value').extract()  # [1:]
    if len(cityAreaName_list) != len(cityAreaStartUrl_list):
        return
    base_items = response.meta["items"]
    for i, cityAreaName in enumerate(cityAreaName_list):
        cityAreaName = normCityAreaName(cityAreaName)
        if not judgeAreaName(self.conn.cursor(), cityAreaName):
            continue
        items = base_items.copy()  # copy per district so concurrent requests do not share one mutable item
        items['cityAreaName'] = cityAreaName  # e.g. 松江区
        # Concatenate, e.g. http://newhouse.nj.house365.com/house/dist-12_p-1
        cityAreaStartUrl = response.url + "dist-%s_p-1" % cityAreaStartUrl_list[i]
        items['cityAreaStartUrl'] = cityAreaStartUrl
        yield Request(cityAreaStartUrl, callback=self.communityParse, meta={'items': items})

def cityAreaParse(self, response):
    # Parse every district of one city together with its URL
    cityAreaName_list = response.xpath('//div[@class="filter"]/dl[1]/dd/a/text()').extract()
    cityAreaStartUrl_list = response.xpath('//div[@class="filter"]/dl[1]/dd/a/@href').extract()
    N = len(cityAreaName_list)
    if N != len(cityAreaStartUrl_list):
        return
    base_items = response.meta["items"]
    for i in range(N):
        cityAreaName = normCityAreaName(cityAreaName_list[i])
        if not judgeAreaName(self.conn.cursor(), cityAreaName):  # check the normalized name
            continue
        items = base_items.copy()  # copy per district so concurrent requests do not share one mutable item
        items['cityAreaName'] = cityAreaName  # e.g. 杨浦
        cityAreaStartUrl = response.urljoin(cityAreaStartUrl_list[i])
        items['cityAreaStartUrl'] = cityAreaStartUrl  # e.g. http://newhouse.sh.xkhouse.com/loupan/a1670/
        yield Request(cityAreaStartUrl, callback=self.communityParse, meta={'items': items})

def cityAreaParse(self, response):
    # Parse every district of one city together with its URL
    cityAreaName_list = response.xpath('//div[@id="qy"]/div[@class="lp-pb-s2"]/ul/li/a/text()').extract()
    cityAreaStartUrl_list = response.xpath('//div[@id="qy"]/div[@class="lp-pb-s2"]/ul/li/a/@href').extract()
    N = len(cityAreaName_list)
    if N != len(cityAreaStartUrl_list):
        return
    base_items = response.meta["items"]
    for i in range(N):
        cityAreaName = re.sub(r'\(\d+\)', '', cityAreaName_list[i])  # drop the listing count, e.g. 嘉兴(296) -> 嘉兴
        cityAreaName = normCityAreaName(cityAreaName)
        if not judgeAreaName(self.conn.cursor(), cityAreaName):
            continue
        items = base_items.copy()  # copy per district so concurrent requests do not share one mutable item
        items['cityAreaName'] = cityAreaName  # e.g. 松江区
        cityAreaStartUrl = cityAreaStartUrl_list[i]
        items['cityAreaStartUrl'] = cityAreaStartUrl  # e.g. http://sh.jiwu.com/loupan/list-qa14137.html
        yield Request(cityAreaStartUrl, callback=self.communityParse, meta={'items': items})

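# A minimal sketch of the communityParse callback that every cityAreaParse above
# schedules; it is not in the original source, and the two XPaths are placeholder
# assumptions that differ per site. The flow is the usual Scrapy list-page
# pattern: follow each community detail link into parse_item, then follow
# pagination recursively back into communityParse.
def communityParse(self, response):
    # parse_item re-derives city/district from the detail page itself, so no
    # meta is strictly required for it.
    for href in response.xpath('//div[@class="list"]//h3/a/@href').extract():  # hypothetical listing XPath
        yield Request(response.urljoin(href), callback=self.parse_item)
    # Follow the pager, carrying the area item along for spiders that need it.
    nextUrl = response.xpath(u'//a[contains(text(), "下一页")]/@href').extract_first()  # hypothetical pager XPath
    if nextUrl:
        yield Request(response.urljoin(nextUrl), callback=self.communityParse,
                      meta={'items': response.meta['items']})
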
def parse(self, response):
    if not isinstance(self.comNumber, int):
        raise Exception('---------- site number comNumber is not set ----------------- ')
    self.conn = connectDataBase()
    if not createSaveHouseImgPath(self.name):  # directory creation failed -> stop the spider
        return
    # Parse every city on this site together with its start URL
    cityName_list = response.xpath('//ul[@class="section-four-a2"]/li/a/text()').extract()
    cityStartUrl_list = response.xpath('//ul[@class="section-four-a2"]/li/a/@href').extract()
    if len(cityName_list) != len(cityStartUrl_list):
        print 'len(cityName_list)!=len(cityStartUrl_list)'
        return
    N = len(cityName_list)
    for i in range(N):
        cityName = normCityAreaName(cityName_list[i])
        if not judgeCityName(self.conn.cursor(), cityName):  # is this city in the database?
            continue
        items = SpiderssetItem()
        items['cityName'] = cityName
        cityStartUrl = cityStartUrl_list[i] + '/loupan/'
        items['cityStartUrl'] = cityStartUrl
        yield Request(cityStartUrl, callback=self.cityAreaParse, meta={'items': items})

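# A guess at the shared normCityAreaName helper, which lives elsewhere in the
# project and is only called here. From its call sites it must tolerate None
# (extract_first() output is passed straight in) and at least trim whitespace;
# everything beyond that is an assumption.
def normCityAreaName(name):
    if not name:
        return name
    # Assumed normalization: remove all surrounding and embedded whitespace,
    # including full-width spaces, which the site markup often leaves in the text.
    return u''.join(name.split())
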