示例#1
0
    def parse_item(self, response):
        """Build a ``SpiderssetItem`` from a community detail page.

        Nothing is yielded when the city, district, community name or cover
        image cannot be read, when ``judgeCityName`` / ``judgeAreaName`` /
        ``judgeNewCommunity`` return a falsy value, or when
        ``kaipanTimeParse`` returns ``False``.

        Yields:
            A ``Request`` for the house-type page with the item stored in
            ``meta['items']`` when that link is present, else the item itself.
        """
        cityName=response.xpath('//div[@id="curCity"]/span[1]/text()').extract_first()
        if not cityName:
            # No city node: stop here instead of passing None to the normalizer.
            return
        cityName=normCityAreaName(cityName)
        if not cityName or not judgeCityName(self.conn.cursor(),cityName):
            return
        cityAreaName=response.xpath('//div[@class="newhouse_details fl "]/ul/li[3]/text()').extract_first('')
        # Only the text before the first space is used as the district name.
        cityAreaName=normCityAreaName(cityAreaName.split(' ')[0])
        if not cityAreaName or not judgeAreaName(self.conn.cursor(),cityAreaName):
            return

        communityName=response.xpath('//div[@class="house_title"]/h1/text()').extract_first()
        communityCoverUrl=response.xpath('//div[@class="album_detail fl"]/a/img/@src').extract_first()
        if not communityName or not communityCoverUrl:
            return

        communityAreaData=judgeNewCommunity(self.conn.cursor(),communityName,cityAreaName,cityName,
                                            communityHomeUrl=response.url)
        if not communityAreaData:
            return

        kaipanTime=response.xpath('//div[@class="discription_more p_15"]/p[3]/text()').extract_first()
        kaipanTime=kaipanTimeParse(kaipanTime)
        # Only an explicit False aborts; other falsy results are kept.
        if kaipanTime is False:
            return
        dizhi=response.xpath('//div[@class="discription_more p_15"]/p[last()]/em/text()').extract_first('').strip()
        wuye=response.xpath('//div[@class="discription_more p_15"]/p[last()-1]/em/text()').extract_first('').strip()
        kaifashang=response.xpath('//div[@class="discription_more p_15"]/p[5]/text()').extract_first('').strip()
        jieshao=response.xpath('//div[@id="lightspot"]/@content').extract_first('').strip()

        items=SpiderssetItem()
        items['communityAreaData']=communityAreaData
        # Rewrite the NNNxNNN size token to 600x450; the src value may carry
        # stray \n / \r characters, hence the strip().
        items['communityCoverUrl']=re.sub(r'\d{3}x\d{3}','600x450',communityCoverUrl).strip()
        items['comNumber']=self.comNumber
        items['spidersName']=self.name
        items['communityName']=communityName
        items['cityName']=cityName
        items['cityAreaName']=cityAreaName
        items['kaipanTime']=kaipanTime
        items['dizhi']=dizhi
        items['wuye']=wuye
        items['kaifashang']=kaifashang
        items['jieshao']=jieshao
        items['communityHomeUrl']=response.url

        communityHouseTypeUrl=response.xpath('//li[@id="noCur"]/a/@href').extract_first()
        if communityHouseTypeUrl:
            communityHouseTypeUrl=response.urljoin(communityHouseTypeUrl)
            items['communityHouseTypeUrl']=communityHouseTypeUrl
            # Start with empty lists; presumably filled by the next callback.
            items['houseImgUrl_list']=[]
            items['houseType_list']=[]
            items['houseArea_list']=[]
            yield Request(communityHouseTypeUrl,callback=self.communityHouseTypeParse,meta={'items':items})
        else:
            yield items
示例#2
0
    def parse(self, response):
        """Entry callback: issue one request per accepted city.

        Raises ``Exception`` when ``self.comNumber`` is not an ``int``. Stores
        the database connection on ``self.conn``. Yields nothing when
        ``createSaveHouseImgPath`` fails or when the scraped name and link
        lists differ in length.
        """
        if not isinstance(self.comNumber, int):
            raise Exception(
                '---------- 没有设置网站编号----- comNumber ----------------- ')
        self.conn = connectDataBase()
        image_dir_ready = createSaveHouseImgPath(self.name)
        if not image_dir_ready:
            # Image folder could not be created: end the crawl.
            return

        # Every city listed on the site together with its URL.
        found_names = response.xpath(
            '//div[@class="leter_list clearfix"]/ul/li/a/text()').extract()
        found_urls = response.xpath(
            '//div[@class="leter_list clearfix"]/ul/li/a/@href').extract()
        if len(found_names) != len(found_urls):
            print('len(cityName_list)!=len(cityStartUrl_list)')
            return

        for raw_name, start_url in zip(found_names, found_urls):
            city = normCityAreaName(raw_name)
            # Only cities accepted by judgeCityName are crawled.
            if judgeCityName(self.conn.cursor(), city):
                entry = SpiderssetItem()
                entry['cityName'] = city
                entry['spidersName'] = self.name
                entry['comNumber'] = self.comNumber
                yield Request(start_url,
                              callback=self.communityListUrlParse,
                              meta={'items': entry})
示例#3
0
    def parse(self, response):
        """Start of the crawl: schedule a request for each accepted city.

        Raises ``Exception`` if ``self.comNumber`` is not an ``int``. Sets
        ``self.conn`` from ``connectDataBase()``. Ends without yielding when
        ``createSaveHouseImgPath`` returns a falsy value or the name/link
        lists have different lengths.
        """
        if not isinstance(self.comNumber,int):
            raise Exception('---------- 没有设置网站编号----- comNumber ----------------- ')
        self.conn=connectDataBase()
        if not createSaveHouseImgPath(self.name):
            # Folder creation failed: stop the spider.
            return

        # All cities of the site and their URLs.
        link_texts=response.xpath('//div[@class="leter_list clearfix"]/ul/li/a/text()').extract()
        link_hrefs=response.xpath('//div[@class="leter_list clearfix"]/ul/li/a/@href').extract()
        if len(link_texts)!=len(link_hrefs):
            print('len(cityName_list)!=len(cityStartUrl_list)')
            return

        for text,href in zip(link_texts,link_hrefs):
            normalized=normCityAreaName(text)
            city_accepted=judgeCityName(self.conn.cursor(),normalized)
            if not city_accepted:
                continue
            city_item=SpiderssetItem()
            city_item['cityName']=normalized
            city_item['spidersName']=self.name
            city_item['comNumber']=self.comNumber
            yield Request(href,callback=self.communityListUrlParse,meta={'items':city_item})
示例#4
0
    def parse(self, response):
        """Entry callback: request the new-house listing of each accepted city.

        Raises ``Exception`` when ``self.comNumber`` is not an ``int``. Stores
        ``connectDataBase(self.name)`` on ``self.conn``. Yields nothing when
        ``createSaveHouseImgPath`` fails or the name/link lists differ in
        length.
        """
        if not isinstance(self.comNumber,int):
            raise Exception(u'---------- 没有设置网站编号----- comNumber ----------------- ')
        self.conn=connectDataBase(self.name)
        if not createSaveHouseImgPath(self.name):
            # Folder creation failed: stop the spider.
            return

        # All cities of the site and their URLs.
        hot_names=response.xpath('//div[@class="sm_station_hot"]/a/text()').extract()
        hot_links=response.xpath('//div[@class="sm_station_hot"]/a/@href').extract()
        if len(hot_names)!=len(hot_links):
            print('len(cityName_list)!=len(cityStartUrl_list)')
            return

        for scraped_name,scraped_link in zip(hot_names,hot_links):
            normalized=normCityAreaName(scraped_name)
            # The city check uses the name as scraped; the item stores the
            # normalized form.
            if judgeCityName(self.conn.cursor(),scraped_name):
                city_item=SpiderssetItem()
                city_item['cityName']=normalized
                # e.g. http://newhouse.nj.house365.com/house/
                listing_url=scraped_link.replace('://','://newhouse.').replace('index.html','house/')
                city_item['cityStartUrl']=listing_url
                yield Request(listing_url,callback=self.cityAreaParse,meta={'items':city_item})
示例#5
0
    def parse(self, response):
        """Entry callback: request the ``loupan/`` listing of each accepted city.

        Raises ``Exception`` when ``self.comNumber`` is not an ``int``. Stores
        ``connectDataBase()`` on ``self.conn``. Yields nothing when
        ``createSaveHouseImgPath`` fails or the name/link lists differ in
        length.
        """
        if not isinstance(self.comNumber, int):
            raise Exception(
                u'---------- 没有设置网站编号 comNumber 参数 ----------------- ')
        self.conn = connectDataBase()
        folder_ok = createSaveHouseImgPath(self.name)
        if not folder_ok:
            # Folder creation failed: stop the spider.
            return

        # All cities of the site and their URLs.
        nav_names = response.xpath(
            '//div[@class="topnav-sub city"][1]/a/text()').extract()
        nav_links = response.xpath(
            '//div[@class="topnav-sub city"][1]/a/@href').extract()
        if len(nav_names) != len(nav_links):
            print('len(cityName_list)!=len(cityStartUrl_list)')
            return

        for scraped_name, scraped_link in zip(nav_names, nav_links):
            normalized = normCityAreaName(scraped_name)
            # The city check uses the name as scraped; the item stores the
            # normalized form.
            if judgeCityName(self.conn.cursor(), scraped_name):
                city_item = SpiderssetItem()
                city_item['cityName'] = normalized
                listing_url = scraped_link.replace(
                    '://', '://newhouse.') + 'loupan/'
                city_item['cityStartUrl'] = listing_url
                yield Request(listing_url,
                              callback=self.cityAreaParse,
                              meta={'items': city_item})
示例#6
0
    def parse(self, response):
        """Start of the crawl: one ``cityAreaParse`` request per accepted city.

        Raises ``Exception`` if ``self.comNumber`` is not an ``int``. Sets
        ``self.conn`` from ``connectDataBase(self.name)``. Ends without
        yielding when ``createSaveHouseImgPath`` returns a falsy value or the
        name/link lists have different lengths.
        """
        if not isinstance(self.comNumber, int):
            raise Exception(
                u'---------- 没有设置网站编号----- comNumber ----------------- ')
        self.conn = connectDataBase(self.name)
        if not createSaveHouseImgPath(self.name):
            # Folder creation failed: stop the spider.
            return

        # All cities of the site and their URLs.
        station_names = response.xpath(
            '//div[@class="sm_station_hot"]/a/text()').extract()
        station_links = response.xpath(
            '//div[@class="sm_station_hot"]/a/@href').extract()
        if len(station_names) != len(station_links):
            print('len(cityName_list)!=len(cityStartUrl_list)')
            return

        for label, href in zip(station_names, station_links):
            city = normCityAreaName(label)
            # The lookup is done with the scraped label, not the normalized one.
            accepted = judgeCityName(self.conn.cursor(), label)
            if not accepted:
                continue
            record = SpiderssetItem()
            record['cityName'] = city
            # e.g. http://newhouse.nj.house365.com/house/
            target = href.replace('://', '://newhouse.')
            target = target.replace('index.html', 'house/')
            record['cityStartUrl'] = target
            yield Request(target,
                          callback=self.cityAreaParse,
                          meta={'items': record})
示例#7
0
    def parse(self, response):
        """Start of the crawl: one ``cityAreaParse`` request per accepted city.

        Raises ``Exception`` if ``self.comNumber`` is not an ``int``. Sets
        ``self.conn`` from ``connectDataBase()``. Ends without yielding when
        ``createSaveHouseImgPath`` returns a falsy value or the name/link
        lists have different lengths.
        """
        if not isinstance(self.comNumber,int):
            raise Exception(u'---------- 没有设置网站编号 comNumber 参数 ----------------- ')
        self.conn=connectDataBase()
        if not createSaveHouseImgPath(self.name):
            # Folder creation failed: stop the spider.
            return

        # All cities of the site and their URLs.
        labels=response.xpath('//div[@class="topnav-sub city"][1]/a/text()').extract()
        hrefs=response.xpath('//div[@class="topnav-sub city"][1]/a/@href').extract()
        if len(labels)!=len(hrefs):
            print('len(cityName_list)!=len(cityStartUrl_list)')
            return

        for label,href in zip(labels,hrefs):
            city=normCityAreaName(label)
            # The lookup is done with the scraped label, not the normalized one.
            accepted=judgeCityName(self.conn.cursor(),label)
            if not accepted:
                continue
            record=SpiderssetItem()
            record['cityName']=city
            target=href.replace('://','://newhouse.')+'loupan/'
            record['cityStartUrl']=target
            yield Request(target,callback=self.cityAreaParse,meta={'items':record})
示例#8
0
 def cityAreaParse(self,response):
     """Schedule one ``communityParse`` request per accepted district of a city.

     Reads the item passed in ``response.meta['items']`` and, for every
     district accepted by ``judgeAreaName``, yields a ``Request`` carrying
     its own copy of that item with ``cityAreaName`` and
     ``cityAreaStartUrl`` filled in. Yields nothing when the scraped name
     and link lists differ in length.
     """
     cityAreaName_list=response.xpath('//div[@id="dict_key"]/dl[1]/dd/ul[1]/li/a/text()').extract()
     cityAreaStartUrl_list=response.xpath('//div[@id="dict_key"]/dl[1]/dd/ul[1]/li/a/@href').extract()
     if len(cityAreaName_list)!=len(cityAreaStartUrl_list):
         return
     city_items=response.meta["items"]
     for cityAreaName,cityAreaStartUrl in zip(cityAreaName_list,cityAreaStartUrl_list):
         cityAreaName=normCityAreaName(cityAreaName)
         if not judgeAreaName(self.conn.cursor(),cityAreaName):
             continue
         # Copy per request: sharing one item would let later iterations
         # overwrite the district seen by earlier requests' callbacks.
         items=city_items.copy()
         items['cityAreaName']=cityAreaName   # e.g. 松江区
         items['cityAreaStartUrl']=cityAreaStartUrl   # e.g. http://sh.jiwu.com/loupan/list-qa14137.html
         yield Request(cityAreaStartUrl,callback=self.communityParse,meta={'items':items})
示例#9
0
 def cityAreaParse(self,response):
     """Schedule one ``communityParse`` request per accepted district of a city.

     Reads the item passed in ``response.meta['items']`` and, for every
     district accepted by ``judgeAreaName``, yields a ``Request`` carrying
     its own copy of that item with ``cityAreaName`` and
     ``cityAreaStartUrl`` filled in. Yields nothing when the scraped name
     and value lists differ in length.
     """
     # The original note says the first entry is "全部" ("all"); it is not
     # sliced off here and is left to the judgeAreaName filter.
     cityAreaName_list=response.xpath('//a[@class="clickStatistics_xfsxqs"]/text()').extract()
     cityAreaStartUrl_list=response.xpath('//a[@class="clickStatistics_xfsxqs"]/@data-value').extract()
     if len(cityAreaName_list)!=len(cityAreaStartUrl_list):
         return
     city_items=response.meta["items"]
     for cityAreaName,dist_value in zip(cityAreaName_list,cityAreaStartUrl_list):
         cityAreaName=normCityAreaName(cityAreaName)
         if not judgeAreaName(self.conn.cursor(),cityAreaName):
             continue
         # Copy per request: sharing one item would let later iterations
         # overwrite the district seen by earlier requests' callbacks.
         items=city_items.copy()
         items['cityAreaName']=cityAreaName   # e.g. 松江区
         # Append the district filter to the current URL,
         # e.g. http://newhouse.nj.house365.com/house/dist-12_p-1
         cityAreaStartUrl=response.url+"dist-%s_p-1"%dist_value
         items['cityAreaStartUrl']=cityAreaStartUrl
         yield Request(cityAreaStartUrl,callback=self.communityParse,meta={'items':items})
示例#10
0
 def cityAreaParse(self,response):
     """Schedule one ``communityParse`` request per accepted district of a city.

     Reads the item passed in ``response.meta['items']`` and, for every
     district accepted by ``judgeAreaName``, yields a ``Request`` carrying
     its own copy of that item with ``cityAreaName`` and
     ``cityAreaStartUrl`` filled in. Yields nothing when the scraped name
     and link lists differ in length.
     """
     cityAreaName_list=response.xpath('//div[@class="filter"]/dl[1]/dd/a/text()').extract()
     cityAreaStartUrl_list=response.xpath('//div[@class="filter"]/dl[1]/dd/a/@href').extract()
     if len(cityAreaName_list)!=len(cityAreaStartUrl_list):
         return
     city_items=response.meta["items"]
     for raw_name,href in zip(cityAreaName_list,cityAreaStartUrl_list):
         cityAreaName=normCityAreaName(raw_name)
         # NOTE(review): the check uses the scraped name while the item stores
         # the normalized one - confirm this is intended.
         if not judgeAreaName(self.conn.cursor(),raw_name):
             continue
         # Copy per request: sharing one item would let later iterations
         # overwrite the district seen by earlier requests' callbacks.
         items=city_items.copy()
         items['cityAreaName']=cityAreaName   # e.g. 杨浦
         cityAreaStartUrl=response.urljoin(href)
         items['cityAreaStartUrl']=cityAreaStartUrl   # e.g. http://newhouse.sh.xkhouse.com/loupan/a1670/
         yield Request(cityAreaStartUrl,callback=self.communityParse,meta={'items':items})
示例#11
0
    def cityAreaParse(self,response):
        """Schedule one ``communityParse`` request per accepted district of a city.

        Reads the item passed in ``response.meta['items']`` and, for every
        district accepted by ``judgeAreaName``, yields a ``Request`` carrying
        its own copy of that item with ``cityAreaName`` and
        ``cityAreaStartUrl`` filled in. Yields nothing when the scraped name
        and link lists differ in length.
        """
        cityAreaName_list=response.xpath('//div[@id="qy"]/div[@class="lp-pb-s2"]/ul/li/a/text()').extract()
        cityAreaStartUrl_list=response.xpath('//div[@id="qy"]/div[@class="lp-pb-s2"]/ul/li/a/@href').extract()
        if len(cityAreaName_list)!=len(cityAreaStartUrl_list):
            return

        city_items=response.meta["items"]
        for raw_name,cityAreaStartUrl in zip(cityAreaName_list,cityAreaStartUrl_list):
            # Drop the "(count)" suffix, e.g. 嘉兴(296) -> 嘉兴
            cityAreaName=re.sub(r'\(\d+\)','',raw_name)
            cityAreaName=normCityAreaName(cityAreaName)
            if not judgeAreaName(self.conn.cursor(),cityAreaName):
                continue
            # Copy per request: sharing one item would let later iterations
            # overwrite the district seen by earlier requests' callbacks.
            items=city_items.copy()
            items['cityAreaName']=cityAreaName   # e.g. 松江区
            items['cityAreaStartUrl']=cityAreaStartUrl   # e.g. http://sh.jiwu.com/loupan/list-qa14137.html
            yield Request(cityAreaStartUrl,callback=self.communityParse,meta={'items':items})
示例#12
0
 def cityAreaParse(self, response):
     """Schedule one ``communityParse`` request per accepted district of a city.

     Reads the item passed in ``response.meta['items']`` and, for every
     district accepted by ``judgeAreaName``, yields a ``Request`` carrying
     its own copy of that item with ``cityAreaName`` and
     ``cityAreaStartUrl`` filled in. Yields nothing when the scraped name
     and link lists differ in length.
     """
     cityAreaName_list = response.xpath(
         '//div[@id="dict_key"]/dl[1]/dd/ul[1]/li/a/text()').extract()
     cityAreaStartUrl_list = response.xpath(
         '//div[@id="dict_key"]/dl[1]/dd/ul[1]/li/a/@href').extract()
     if len(cityAreaName_list) != len(cityAreaStartUrl_list):
         return
     city_items = response.meta["items"]
     for cityAreaName, cityAreaStartUrl in zip(cityAreaName_list,
                                               cityAreaStartUrl_list):
         cityAreaName = normCityAreaName(cityAreaName)
         if not judgeAreaName(self.conn.cursor(), cityAreaName):
             continue
         # Copy per request: sharing one item would let later iterations
         # overwrite the district seen by earlier requests' callbacks.
         items = city_items.copy()
         items['cityAreaName'] = cityAreaName  # e.g. 松江区
         # e.g. http://sh.jiwu.com/loupan/list-qa14137.html
         items['cityAreaStartUrl'] = cityAreaStartUrl
         yield Request(cityAreaStartUrl,
                       callback=self.communityParse,
                       meta={'items': items})
示例#13
0
 def cityAreaParse(self, response):
     """Schedule one ``communityParse`` request per accepted district of a city.

     Reads the item passed in ``response.meta['items']`` and, for every
     district accepted by ``judgeAreaName``, yields a ``Request`` carrying
     its own copy of that item with ``cityAreaName`` and
     ``cityAreaStartUrl`` filled in. Yields nothing when the scraped name
     and link lists differ in length.
     """
     cityAreaName_list = response.xpath(
         '//div[@class="filter"]/dl[1]/dd/a/text()').extract()
     cityAreaStartUrl_list = response.xpath(
         '//div[@class="filter"]/dl[1]/dd/a/@href').extract()
     if len(cityAreaName_list) != len(cityAreaStartUrl_list):
         return
     city_items = response.meta["items"]
     for raw_name, href in zip(cityAreaName_list, cityAreaStartUrl_list):
         cityAreaName = normCityAreaName(raw_name)
         # NOTE(review): the check uses the scraped name while the item stores
         # the normalized one - confirm this is intended.
         if not judgeAreaName(self.conn.cursor(), raw_name):
             continue
         # Copy per request: sharing one item would let later iterations
         # overwrite the district seen by earlier requests' callbacks.
         items = city_items.copy()
         items['cityAreaName'] = cityAreaName  # e.g. 杨浦
         cityAreaStartUrl = response.urljoin(href)
         # e.g. http://newhouse.sh.xkhouse.com/loupan/a1670/
         items['cityAreaStartUrl'] = cityAreaStartUrl
         yield Request(cityAreaStartUrl,
                       callback=self.communityParse,
                       meta={'items': items})
示例#14
0
 def cityAreaParse(self, response):
     """Schedule one ``communityParse`` request per accepted district of a city.

     Reads the item passed in ``response.meta['items']`` and, for every
     district accepted by ``judgeAreaName``, yields a ``Request`` carrying
     its own copy of that item with ``cityAreaName`` and
     ``cityAreaStartUrl`` filled in. Yields nothing when the scraped name
     and value lists differ in length.
     """
     # The original note says the first entry is "全部" ("all"); it is not
     # sliced off here and is left to the judgeAreaName filter.
     cityAreaName_list = response.xpath(
         '//a[@class="clickStatistics_xfsxqs"]/text()').extract()
     cityAreaStartUrl_list = response.xpath(
         '//a[@class="clickStatistics_xfsxqs"]/@data-value').extract()
     if len(cityAreaName_list) != len(cityAreaStartUrl_list):
         return
     city_items = response.meta["items"]
     for cityAreaName, dist_value in zip(cityAreaName_list,
                                         cityAreaStartUrl_list):
         cityAreaName = normCityAreaName(cityAreaName)
         if not judgeAreaName(self.conn.cursor(), cityAreaName):
             continue
         # Copy per request: sharing one item would let later iterations
         # overwrite the district seen by earlier requests' callbacks.
         items = city_items.copy()
         items['cityAreaName'] = cityAreaName  # e.g. 松江区
         # Append the district filter to the current URL,
         # e.g. http://newhouse.nj.house365.com/house/dist-12_p-1
         cityAreaStartUrl = response.url + "dist-%s_p-1" % dist_value
         items['cityAreaStartUrl'] = cityAreaStartUrl
         yield Request(cityAreaStartUrl,
                       callback=self.communityParse,
                       meta={'items': items})
示例#15
0
    def parse(self, response):
        """Entry callback: request the ``/loupan/`` page of each accepted city.

        Raises ``Exception`` when ``self.comNumber`` is not an ``int``. Stores
        ``connectDataBase()`` on ``self.conn``. Yields nothing when
        ``createSaveHouseImgPath`` fails or the name/link lists differ in
        length.
        """
        if not isinstance(self.comNumber,int):
            raise Exception('---------- 没有设置网站编号----- comNumber ----------------- ')
        self.conn=connectDataBase()
        if not createSaveHouseImgPath(self.name):
            # Folder creation failed: stop the spider.
            return

        # All cities of the site and their URLs.
        labels=response.xpath('//ul[@class="section-four-a2"]/li/a/text()').extract()
        hrefs=response.xpath('//ul[@class="section-four-a2"]/li/a/@href').extract()
        if len(labels)!=len(hrefs):
            print('len(cityName_list)!=len(cityStartUrl_list)')
            return

        for label,href in zip(labels,hrefs):
            city=normCityAreaName(label)
            # Only cities accepted by judgeCityName are crawled.
            if judgeCityName(self.conn.cursor(),city):
                record=SpiderssetItem()
                record['cityName']=city
                target=href+'/loupan/'
                record['cityStartUrl']=target
                yield Request(target,callback=self.cityAreaParse,meta={'items':record})
示例#16
0
    def parse_item(self, response):
        """Build a ``SpiderssetItem`` from a community detail page.

        Nothing is yielded when the city, district, community name or cover
        image cannot be read, when ``judgeCityName`` / ``judgeAreaName`` /
        ``judgeNewCommunity`` return a falsy value, or when
        ``kaipanTimeParse`` returns ``False``.

        Yields:
            A ``Request`` for the house-type page with the item stored in
            ``meta['items']`` when that link is present, else the item itself.
        """
        cityName = response.xpath(
            '//div[@id="curCity"]/span[1]/text()').extract_first()
        if not cityName:
            # No city node: stop here instead of passing None to the normalizer.
            return
        cityName = normCityAreaName(cityName)
        if not cityName or not judgeCityName(self.conn.cursor(), cityName):
            return
        cityAreaName = response.xpath(
            '//div[@class="newhouse_details fl "]/ul/li[3]/text()'
        ).extract_first('')
        # Only the text before the first space is used as the district name.
        cityAreaName = normCityAreaName(cityAreaName.split(' ')[0])
        if not cityAreaName or not judgeAreaName(self.conn.cursor(),
                                                 cityAreaName):
            return

        communityName = response.xpath(
            '//div[@class="house_title"]/h1/text()').extract_first()
        communityCoverUrl = response.xpath(
            '//div[@class="album_detail fl"]/a/img/@src').extract_first()
        if not communityName or not communityCoverUrl:
            return

        communityAreaData = judgeNewCommunity(self.conn.cursor(),
                                              communityName,
                                              cityAreaName,
                                              cityName,
                                              communityHomeUrl=response.url)
        if not communityAreaData:
            return

        kaipanTime = response.xpath(
            '//div[@class="discription_more p_15"]/p[3]/text()'
        ).extract_first()
        kaipanTime = kaipanTimeParse(kaipanTime)
        # Only an explicit False aborts; other falsy results are kept.
        if kaipanTime is False:
            return
        dizhi = response.xpath(
            '//div[@class="discription_more p_15"]/p[last()]/em/text()'
        ).extract_first('').strip()
        wuye = response.xpath(
            '//div[@class="discription_more p_15"]/p[last()-1]/em/text()'
        ).extract_first('').strip()
        kaifashang = response.xpath(
            '//div[@class="discription_more p_15"]/p[5]/text()'
        ).extract_first('').strip()
        jieshao = response.xpath(
            '//div[@id="lightspot"]/@content').extract_first('').strip()

        items = SpiderssetItem()
        items['communityAreaData'] = communityAreaData
        # Rewrite the NNNxNNN size token to 600x450; the src value may carry
        # stray \n / \r characters, hence the strip().
        items['communityCoverUrl'] = re.sub(
            r'\d{3}x\d{3}', '600x450', communityCoverUrl).strip()
        items['comNumber'] = self.comNumber
        items['spidersName'] = self.name
        items['communityName'] = communityName
        items['cityName'] = cityName
        items['cityAreaName'] = cityAreaName
        items['kaipanTime'] = kaipanTime
        items['dizhi'] = dizhi
        items['wuye'] = wuye
        items['kaifashang'] = kaifashang
        items['jieshao'] = jieshao
        items['communityHomeUrl'] = response.url

        communityHouseTypeUrl = response.xpath(
            '//li[@id="noCur"]/a/@href').extract_first()
        if communityHouseTypeUrl:
            communityHouseTypeUrl = response.urljoin(communityHouseTypeUrl)
            items['communityHouseTypeUrl'] = communityHouseTypeUrl
            # Start with empty lists; presumably filled by the next callback.
            items['houseImgUrl_list'] = []
            items['houseType_list'] = []
            items['houseArea_list'] = []
            yield Request(communityHouseTypeUrl,
                          callback=self.communityHouseTypeParse,
                          meta={'items': items})
        else:
            yield items