Example #1
    def parse_esf(self, response):
        province, city = response.meta.get('info')
        dls = response.xpath('//div[@class="shop_list shop_list_4"]/dl')
        for dl in dls:
            item = ESFHouseItem(province=province, city=city)
            item['name'] = dl.xpath('.//p[@class="add_shop"]/a/@title').get()
            infos = dl.xpath('.//p[@class="tel_shop"]/text()').getall()
            infos = list(map(lambda x: re.sub(r'\s', '', x), infos))[0:-1]
            for info in infos:
                if '厅' in info:
                    item['rooms'] = info
                elif '㎡' in info:
                    item['area'] = info
                elif '层' in info:
                    item['floor'] = info
                elif '向' in info:
                    item['toward'] = info
                else:
                    item['year'] = info

            item['address'] = dl.xpath(
                './/p[@class="add_shop"]/span/text()').get()

            origin_url = response.urljoin(dl.xpath('.//a/@href').get())
            item['origin_url'] = origin_url
            yield scrapy.Request(url=origin_url,
                                 callback=self.parse_detail,
                                 meta={'info': item})

        next_url = response.xpath('//div[@class="page_al"]/p[1]/a/@href').get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_esf,
                                 meta={'info': (province, city)})
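
These examples all rely on import scrapy, import re and an ESFHouseItem declared in the project's items.py (Examples #4 and #23 additionally use urllib.parse.urljoin). None of them shows the item itself; a minimal sketch, with field names inferred from the snippets in this section (a real items.py may define more or differently named fields), might look like:

    import scrapy

    class ESFHouseItem(scrapy.Item):
        # field names inferred from the parse_esf examples in this section
        province = scrapy.Field()    # province the listing belongs to
        city = scrapy.Field()        # city name
        name = scrapy.Field()        # residential-complex name
        rooms = scrapy.Field()       # layout, e.g. "3室2厅"
        area = scrapy.Field()        # floor area, e.g. "89㎡"
        floor = scrapy.Field()       # floor description
        toward = scrapy.Field()      # orientation
        year = scrapy.Field()        # year built
        address = scrapy.Field()     # street address
        price = scrapy.Field()       # total price
        unit = scrapy.Field()        # price per square metre
        origin_url = scrapy.Field()  # detail-page URL
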
Example #2
 def parse_esf(self,response):
     province,city = response.meta.get('info')
     dls = response.xpath("//div[@class='houseList']/dl")
     for dl in dls:
         item = ESFHouseItem(province=province,city=city)
         item['name'] = dl.xpath(".//p[@class='mt10']/a/span/text()").get()
         infos = dl.xpath(".//p[@class='mt12']/text()").getall()
         infos = list(map(lambda x:re.sub(r"\s","",x),infos))
         for info in infos:
             if "厅" in info:
                 item['rooms'] = info
             elif '层' in info:
                 item['floor'] = info
             elif '向' in info:
                 item['toward'] = info
             else:
                 item['year'] = info.replace("建筑年代:","")
         item['address'] = dl.xpath(".//p[@class='mt10']/span/@title").get()
         item['area'] = dl.xpath(".//div[contains(@class,'area')]/p/text()").get()
         item['price'] = "".join(dl.xpath(".//div[@class='moreInfo']/p[1]//text()").getall())
         item['unit'] = "".join(dl.xpath(".//div[@class='moreInfo']/p[2]//text()").getall())
         detail_url = dl.xpath(".//p[@class='title']/a/@href").get()
         item['origin_url'] = response.urljoin(detail_url)
         yield item
     next_url = response.xpath("//a[@id='PageControl1_hlk_next']/@href").get()
     if next_url:
         yield scrapy.Request(url=response.urljoin(next_url), callback=self.parse_esf, meta={"info": (province, city)})
Example #3
    def parse_esf(self,response):
        province,city = response.meta.get('info')
        dls = response.xpath("//div[contains(@class,'shop_list')]/dl")
        for dl in dls:
            item = ESFHouseItem(province=province,city=city)
            item['name'] = dl.xpath(".//p[@class='add_shop']/a/@title").get()
            infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
            infos = list(map(lambda x:re.sub(r'\s', '', x),infos))
            for info in infos:
                if "厅" in info:
                    item['rooms'] = info
                elif "层" in info:
                    item['floor'] = info
                elif "向" in info:
                    item['toward'] = info
                elif "m" in info:
                    item['area'] = info
                else:
                    item['year'] = info

            item['address'] = dl.xpath(".//p[@class='add_shop']/span/text()").get()
            prices = ''.join(dl.xpath(".//dd[@class='price_right']/span[1]//text()").getall())
            item['price'] = re.sub(r'\s', '', prices)
            item['unit'] = ''.join(dl.xpath(".//dd[@class='price_right']/span[2]//text()").getall())
            detail_url = dl.xpath(".//h4[@class='clearfix']/a/@href").get()
            origin_url = response.urljoin(detail_url)
            item['origin_url'] = origin_url
            yield item
        next_url = response.xpath("//div[@class='page_al']/p/a/@href").get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url), callback=self.parse_esf, meta={"info": (province, city)})
Example #4
    def parse_esf(self, response):
        # second-hand listings
        province, city_name = response.meta.get('info')
        dls = response.xpath("//div[@class='shop_list shop_list_4']/dl")
        for dl in dls:
            item = ESFHouseItem()
            # extract the listing title
            house_title = dl.xpath('.//h4[@class="clearfix"]/a/@title').extract_first()

            if house_title:
                infos = dl.xpath(".//p[@class='tel_shop']/text()").extract()
                infos = list(map(lambda x: re.sub(r"\s", "", x), infos))
                for info in infos:
                    if "厅" in info:
                        item["rooms"] = info
                    elif '层' in info:
                        item["floor"] = info
                    elif '向' in info:
                        item['toward'] = info
                    elif '㎡' in info:
                        item['area'] = info
                    elif '年建' in info:
                        item['build_year'] = re.sub("年建", "", info)

                # province and city
                item['province'] = province
                item['city'] = city_name
                # listing title
                item['house_title'] = house_title
                # residential-complex name
                item['house_name'] = dl.xpath('.//p[@class="add_shop"]/a/@title').extract_first()
                # contact person
                item['contacts'] = dl.xpath('.//p[@class="tel_shop"]/span[@class="people_name"]/a/text()').extract_first() if dl.xpath('.//p[@class="tel_shop"]/span[@class="people_name"]/a/text()') else '暂无联系人'
                # address
                item['address'] = dl.xpath('.//p[@class="add_shop"]/span/text()').extract_first()
                # selling points
                item['tags'] = '/'.join(dl.xpath('.//dd/p[3]/span/text()').extract()) if dl.xpath('.//dd/p[3]/span/text()') else '暂无卖点'
                # total price
                price = dl.xpath('.//dd[@class="price_right"]/span[1]/b/text()').extract_first()
                price_unit = dl.xpath('.//dd[@class="price_right"]/span[1]/text()').extract_first()
                item['price'] = (price or '') + (price_unit or '')
                # average price per square metre
                item['unit'] = dl.xpath(".//dd[@class='price_right']/span[2]/text()").extract_first()
                # detail-page url
                detail_url = dl.xpath(".//h4[@class='clearfix']/a/@href").extract_first()
                item['origin_url'] = response.urljoin(detail_url)

                yield item
        # next pages
        last_url = response.xpath('//div[@class="page_al"]/p/a[contains(.,"末页")]/@href').extract_first()    # e.g. '/house/i3100/'
        # if an unpopular city has only one page of listings, last_url does not exist and .split('/') would raise
        if last_url:
            last_page = last_url.split('/')[-2].replace('i3','')
            for i in range(1,int(last_page)+1):
                next_url = urljoin(response.url,'/house/i3{page}/'.format(page=i))
                if next_url:
                    yield scrapy.Request(url=next_url,
                                         callback=self.parse_esf,
                                         meta={'info': (province, city_name)}
                                         )
Example #5
    def parse_esf(self, response):
        province, city = response.meta.get("info")
        dls = response.xpath("//div[contains(@class, 'shop_list')]/dl[@dataflag = 'bg']")
        for dl in dls:
            item = ESFHouseItem(province = province, city = city)
            # complex name
            name = dl.xpath(".//p[@class = 'add_shop']/a/@title").get()
            item["name"] = name
            # combined info (rooms, area, floor, toward, year)
            infos = dl.xpath(".//p[@class = 'tel_shop']/text()").getall()
            infos = "".join(infos).strip()
            infos = re.sub(r"'|\|\r|\n|\s", "", infos)
            item['infos'] = infos
            # address
            address = dl.xpath(".//p[@class = 'add_shop']/span/text()").get()
            item['address'] = address
            # total price
            price = dl.xpath(".//dd[@class = 'price_right']/span[1]//text()").getall()
            price = "".join(price)
            item['price'] = price
            # unit price (per ㎡)
            unit = dl.xpath(".//dd[@class = 'price_right']/span[2]/text()").get()
            item['unit'] = unit
            # original (detail-page) url
            origin_url = dl.xpath(".//h4[@class = 'clearfix']/a/@href").getall()
            origin_url = "".join(origin_url)
            origin_url = response.urljoin(origin_url)
            item['origin_url'] = origin_url
            yield item

        # next-page url
        next_url = response.xpath("//div[@class = 'page_al']/p[last()-1]/a/@href").get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url), callback=self.parse_esf, meta={"info": (province, city)})
Example #6
 def parse_esf(self, response):
     # second-hand listings
     province, city = response.meta.get('info')
     dls = response.xpath("//div[@class='shop_list shop_list_4']/dl")
     for dl in dls:
         item = ESFHouseItem(province=province, city=city)
         name = dl.xpath(".//span[@class='tit_shop']/text()").get()
         if name:
             infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
             infos = list(map(lambda x: re.sub(r"\s", "", x), infos))
             for info in infos:
                 if "厅" in info:
                     item["rooms"] = info
                 elif '层' in info:
                     item["floor"] = info
                 elif '向' in info:
                     item['toward'] = info
                 elif '㎡' in info:
                     item['area'] = info
                 elif '年建' in info:
                     item['year'] = re.sub("年建", "", info)
             item['address'] = dl.xpath(
                 ".//p[@class='add_shop']/span/text()").get()
             # total price
             item['price'] = "".join(
                 dl.xpath(".//span[@class='red']//text()").getall())
             # unit price
             item['unit'] = dl.xpath(
                 ".//dd[@class='price_right']/span[2]/text()").get()
             item['name'] = name
             detail = dl.xpath(".//h4[@class='clearfix']/a/@href").get()
             item['origin_url'] = response.urljoin(detail)
             yield item
Example #7
    def parse_esf(self, response):
        # captcha_url = response.css('.image img::attr(src)').get()  # fetch the captcha image
        # yzm_url = response.urljoin(captcha_url)
        # print(yzm_url)
        # if len(yzm_url) > 0:
        #     province, city = response.meta.get('info')  # tuple unpacking
        #     formdata = {
        #         'submit': '提交'
        #     }
        #     code = self.text_captcha(yzm_url)
        #     formdata['code'] = code
        #     print(formdata)
        #     url = response.url
        #     yield scrapy.FormRequest(url=url, callback=self.parse_esf,
        #                     meta={'info':(province, city)}, formdata=formdata)
        # else:
        province, city = response.meta.get('info')  # tuple unpacking
        dls = response.xpath('//div[contains(@class,"shop_list")]/dl')
        for dl in dls:
            item = ESFHouseItem(province=province, city=city)
            name = dl.xpath('.//p[@class="add_shop"]/a/text()').get()
            if name == None:
                pass
            else:
                item['name'] = re.sub(r'\s', '', name)
            infos = dl.xpath('.//p[@class="tel_shop"]/text()').getall()
            infos = list(map(lambda x: re.sub(r'\s', '', x), infos))
            for info in infos:
                if '厅' in info:
                    item['rooms'] = info
                elif '层' in info:
                    item['floor'] = info
                elif '向' in info:
                    item['toward'] = info
                elif '建' in info:
                    item['year'] = info
                elif '㎡' in info:
                    item['area'] = info

            item['address'] = dl.xpath(
                './/p[@class="add_shop"]/span/text()').get()
            item['unit'] = dl.xpath(
                './/dd[@class="price_right"]/span[not(@class)]/text()').get()
            item['price'] = "".join(
                dl.xpath(
                    './/dd[@class="price_right"]/span[@class="red"]//text()').
                getall())
            detail_url = dl.xpath('.//h4[@class="clearfix"]/a/@href').get()
            item['origin_url'] = response.urljoin(detail_url)
            yield item
        next_url = response.xpath('//div[@class="page_al"]/p/a/@href').get()
        next_text = response.xpath('//div[@class="page_al"]/p/a/text()').get()
        if next_text == '下一页':
            next_page = response.urljoin(
                next_url)  # join the relative link onto the current page URL
            print(next_page)
            yield scrapy.Request(url=next_page,
                                 callback=self.parse_esf,
                                 meta={'info': (province, city)})
Example #8
    def parse_esf(self, response):

        province, city = response.meta.get('info')

        dls = response.xpath("//div[contains(@class,'shop_list')]//dl")
        for dl in dls:
            contain_house_info = dl.xpath(
                ".//p[@class='add_shop']/a/text()").get()
            if contain_house_info:
                item = ESFHouseItem(province=province, city=city)
                name = dl.xpath(
                    ".//p[@class='add_shop']/a/text()").get().strip()

                item['name'] = name
                item['province'] = province
                item['city'] = city

                infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()

                for info in infos:
                    if '厅' in info:
                        item['rooms'] = info.strip()
                    elif '层' in info:
                        item['floor'] = info.strip()
                    elif '向' in info:
                        item['toward'] = info.strip()
                    elif '年' in info:
                        item['year'] = info.replace("建", "")
                    else:
                        item['area'] = info
                item['address'] = dl.xpath(
                    ".//p[@class='add_shop']/span/text()").get()
                price = "".join(
                    dl.xpath(
                        ".//dd[@class='price_right']/span[@class='red']//text()"
                    ).getall())
                item['price'] = price
                item['unit'] = "".join(
                    dl.xpath(
                        ".//dd[@class='price_right']/span[not(@class)]/text()"
                    ).getall())
                detail_url = dl.xpath(".//h4/a/@href").get()

                item['origin_url'] = response.urljoin(detail_url)

                yield item

        next_url = None
        als = response.xpath("//div[@class='page_al']/p//a")
        for al in als:
            if al.xpath(".//text()").get().strip() == '下一页':
                next_url = al.xpath("./@href").get()

        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_esf,
                                 meta={"info": (province, city)},
                                 dont_filter=True)
Example #9
    def parse_esf(self, response):
        province, city = response.meta.get('info')
        dls = response.xpath("//div[contains(@class,'shop_list')]/dl")
        for dl in dls:
            if dl.xpath(".//p[@class='add_shop']/a/text()").get() == None:
                continue
            name = dl.xpath(".//p[@class='add_shop']/a/text()").get().strip()
            infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
            infos = list(map(lambda x: re.sub(r"\s", "", x), infos))
            rooms = ''
            floor = ''
            toward = ''
            year = ''
            area = ''
            for info in infos:
                if "厅" in info or "拼" in info or "栋" in info or "排" in info:
                    rooms = info
                elif "层" in info:
                    floor = info
                elif "向" in info:
                    toward = info
                elif "年" in info:
                    year = info.replace("年建", "")
                elif "㎡" in info:
                    area = info
            address = dl.xpath(
                ".//p[@class='add_shop']/span/text()").get().strip()
            price = "".join(
                dl.xpath(".//dd[@class='price_right']/span[1]//text()").getall(
                )).strip()
            unit = "".join(
                dl.xpath(".//dd[@class='price_right']/span[2]//text()").get().
                strip())
            detail_url = dl.xpath(".//h4/a/@href").get()
            origin_url = response.urljoin(detail_url)

            item = ESFHouseItem(province=province,
                                city=city,
                                name=name,
                                rooms=rooms,
                                floor=floor,
                                toward=toward,
                                year=year,
                                address=address,
                                area=area,
                                price=price,
                                unit=unit,
                                origin_url=origin_url)
            yield item

        next_url = response.xpath(
            "//div[@class='page_al']/p[last()-2]/a/@href").get()
        next_text = response.xpath(
            "//div[@class='page_al']/p[last()-2]/a/text()").get()
        if next_url and next_text and "下一页" in next_text:
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_esf,
                                 meta={"info": (province, city)})
Example #10
    def parse_esf(self, response):
        province, city = response.meta.get('info')
        dls = response.xpath("//div[contains(@class,'shop_list')]/dl")
        for dl in dls:
            item = ESFHouseItem(province=province, city=city)
            name = dl.xpath(".//p[@class='add_shop']/a/text()").get()
            if name:
                name = name.strip()
            item['name'] = name
            infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
            infos = list(map(lambda x: re.sub(r"\s", "", x), infos))
            for info in infos:
                if "厅" in info:
                    item['rooms'] = info
                elif "层" in info:
                    item['floor'] = info
                elif "向" in info:
                    item['toward'] = info
                elif "㎡" in info:
                    item['area'] = info
                elif "年" in info:
                    item['year'] = info
            address = dl.xpath(".//p[@class='add_shop']/span/text()").get()
            item['address'] = address
            price = dl.xpath(
                ".//dd[@class='price_right']/span[@class='red']//text()"
            ).getall()
            price = "".join(list(map(lambda x: re.sub(r"\s", "", x), price)))
            unit = dl.xpath(".//dd[@class='price_right']/span[2]/text()").get()
            item['price'] = price
            item['unit'] = unit
            suffix_url = dl.xpath(".//h4[@class='clearfix']/a/@href").get()
            item['origin_url'] = response.urljoin(suffix_url)
            # fill in any optional fields this listing did not provide
            for key in ('year', 'rooms', 'floor', 'toward'):
                if key not in item:
                    item[key] = ''

            yield item

        next_url = response.xpath(
            "//div[@class='page_box']/p[3]/a/@href").get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_esf,
                                 meta={'info': (province, city)})
Example #11
    def parse_esf(self, response):
        province, city = response.meta.get('info')
        dls = response.xpath('//div[contains(@class,"hop_list")]/dl')
        for dl in dls:
            name = dl.xpath(".//p[@class='add_shop']/a/text()").get()
            if not name:
                continue
            item = ESFHouseItem(province=province, city=city)
            item['name'] = name.strip()
            # print(name)
            infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
            infos = list(map(lambda x: re.sub(r'\s', "", x), infos))
            # print(infos)
            for info in infos:
                if "厅" in info:
                    item['rooms'] = info
                elif "层" in info:
                    item['floor'] = info
                elif "向" in info:
                    item['toward'] = info
                elif "年建" in info:
                    item['year'] = info.replace("年建", "")
                elif "㎡" in info:
                    item['area'] = info
                else:
                    pass
            # print(item)
            item['address'] = dl.xpath(
                ".//p[@class='add_shop']/span/text()").get()
            # print(address)
            item['price'] = "".join(
                dl.xpath(
                    ".//dd[@class='price_right']/span[1]//text()").getall())
            item['unit'] = dl.xpath(
                ".//dd[@class='price_right']/span[2]/text()").get()
            detail_url = dl.xpath('.//h4[@class="clearfix"]/a/@href').get()
            item['origin_url'] = response.urljoin(detail_url)
            # print(item)
            yield item

        next_url = response.xpath(
            "//div[@id='list_D10_15']/p[3]/a/@href").get()
        # print(next_url)

        if not next_url:
            url = response.xpath("//div[@id='list_D10_15']/p[1]/a/@href").get()
            if not url == '/house/':
                next_url = url
        # print(next_url)
        # print('=='*30)
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_esf,
                                 meta={"info": (province, city)})
Example #12
 def parse_esf(self, response):
     province, city = response.meta.get("info")
     dls = response.xpath("//div[contains(@class,'shop_list')]/dl")
     item = ESFHouseItem()
     for dl in dls:
         item = ESFHouseItem(province=province, city=city)
         name = dl.xpath(".//p[@class='add_shop']/a/text()").get()
         if name is not None:
             name = name.strip()
         item['name'] = name
         infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
         infos = list(map(lambda x: re.sub(r'\s', "", x), infos))
         for info in infos:
             if "厅" in info:
                 item['rooms'] = info
             elif '层' in info:
                 item['floor'] = info
             elif '向' in info:
                 item['toward'] = info
             elif '年' in info:
                 item['year'] = info.replace('年建', "")
             elif '㎡' in info:
                 item["area"] = info
             # print(item)
         address = dl.xpath(".//p[@class='add_shop']/span/text()").get()
         item['address'] = address
         item['price'] = "".join(
             dl.xpath(
                 ".//dd[@class='price_right']/span[1]//text()").getall())
         item['unit'] = "".join(
             dl.xpath(
                 ".//dd[@class='price_right']/span[2]//text()").getall())
         item['origin_url'] = response.urljoin(
             (dl.xpath(".//h4[@class='clearfix']/a/@href").get()))
         # print(item['origin_url'])
         yield item
     next_url = response.xpath("//div[@class='page_al']/p[1]/a/@href").get()
     if next_url:
         yield scrapy.Request(url=response.urljoin(next_url),
                              callback=self.parse_esf,
                              meta={'info': (province, city)})
Example #13
    def parse_esf(self, response):
        province, city = response.meta.get("info")
        print(province + "   " + city)
        dls = response.xpath("//div[@class='shop_list shop_list_4']/dl")
        for dl in dls:
            name = dl.xpath(".//p[@class='add_shop']/a/@title").get()
            if name == None:
                continue
            address = dl.xpath(".//p[@class='add_shop']/span/text()").get()
            shops = "".join(
                dl.xpath(".//p[@class='tel_shop']//text()").getall())
            shops = re.sub(r"\s", "", shops)
            shops = shops.split("|")
            toward = None
            rooms = shops[0]
            area = shops[1]
            floor = shops[2]
            if len(shops) > 5:
                toward = shops[3]
                year = shops[4]
            else:
                year = shops[3]

            url = response.urljoin(
                dl.xpath(".//h4[@class='clearfix']/a/@href").get())
            price = "".join(
                dl.xpath(
                    ".//dd[@class='price_right']/span[@class='red']//text()").
                getall())
            unit = dl.xpath(
                ".//dd[@class='price_right']/span[not(@class='red')]//text()"
            ).get().strip()
            item = ESFHouseItem(name=name,
                                address=address,
                                toward=toward,
                                rooms=rooms,
                                area=area,
                                floor=floor,
                                year=year,
                                url=url,
                                price=price,
                                unit=unit,
                                province=province,
                                city=city)
            yield item
        next_url = response.xpath(
            "//div[@class='page_al']/p[last()-2]/a/@href").get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_esf,
                                 meta={"info": (province, city)})
Example #14
 def parse_esf(self, response):
     print(response.url)
     provinces, city = response.meta.get("info")
     # get all of the listing dl nodes
     dls = response.xpath('//div[contains(@class,"shop_list")]/dl')
     for dl in dls:
         item = ESFHouseItem(provinces=provinces, city=city)
         item["name"] = dl.xpath('.//p[@class="add_shop"]/a/@title').get()
         infos = dl.xpath('.//p[@class="tel_shop"]/text()').getall()
         infos = list(map(lambda x: re.sub(r"\s", "", x), infos))
         for info in infos:
             print(info)
             if '室' in info:
                 item["rooms"] = info
             elif '层' in info:
                 item["floor"] = info
             elif '向' in info:
                 item["toward"] = info
             elif '㎡' in info:
                 item['area'] = info
             else:
                 item["year"] = info.replace("建筑年代", "")
         # address
         item['address'] = dl.xpath(
             './/p[@class="add_shop"]/span/text()').get()
         # total price
         price_s = dl.xpath(
             './/dd[@class="price_right"]/span/b/text()').get()
         price_w = dl.xpath(
             './/dd[@class="price_right"]/span[1]/text()').get()
         if price_s and price_w:
             item['price'] = ''.join(price_s) + ''.join(price_w)
         else:
             item['price'] = ' '
         # price per square metre
         item['unit'] = dl.xpath(
             './/dd[@class="price_right"]/span[2]/text()').get()
         # origin_url
         item['origin_url'] = response.urljoin(
             dl.xpath('.//h4/a/@href').get())
         print(item, response.url, city)
         yield item
     next_url = response.xpath('//div[@class="page_al"]/p[1]/a/@href').get()
     if next_url:
         yield scrapy.Request(url=response.urljoin(next_url),
                              callback=self.parse_esf,
                              meta={"info": (provinces, city)})
Example #15
    def parse_esf(self, response):
        province, city = response.meta.get('info')
        dls = response.xpath("//div[@class='shop_list shop_list_4']/dl")

        for dl in dls:
            item = {}

            infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
            infos = list(map(lambda x: re.sub(r"\s", "", x), infos))
            if infos:
                name = dl.xpath(".//span[@class='tit_shop']/text()").get()
                address = dl.xpath(".//p[@class='add_shop']/span/text()").get()
                price = dl.xpath(".//dd[@class='price_right']/span[@class='red']//text()").getall()
                price = re.sub(r"\s", "", "".join(price))
                unit = dl.xpath(".//dd[@class='price_right']/span[last()]/text()").get()
                unit = re.sub(r"\s", "", unit) if unit else ""
                origin_url = dl.xpath(".//h4[@class='clearfix']/a/@href").get()
                origin_url = response.urljoin(origin_url)

                item['rooms'] = infos[0]
                item['floor'] = infos[2]
                item['area'] = infos[1]
                item['toward'] = infos[3]
                try:
                    year = infos[4]
                    item['year'] = re.sub("年建", "", year)
                except:
                    item['year'] = ''

                item['address'] = address
                item['price'] = price
                item['unit'] = unit
                item['origin_url'] = origin_url
                item['province'] = province
                item['city'] = city
                item['name'] = name
                item = ESFHouseItem(**item)
                yield item
        next_url = response.xpath("//div[@class='page_al']/p/a/@href").get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url), callback=self.parse_esf,
                                 meta={"info": (province, city)})
Example #16
    def parse_esf(self, response):
        province, city = response.meta.get('info')
        dls = response.xpath('//div[@class="shop_list shop_list_4"]/dl')
        for dl in dls:
            item = ESFHouseItem()
            item['province'] = province
            item['city'] = city

            item['name'] = dl.xpath('.//p[@class="add_shop"]/a/@title').get()

            infos = dl.xpath('.//p[@class="tel_shop"]/text()').getall()
            infos = list(map(lambda x: re.sub(r'\s', '', x), infos))

            for info in infos:
                if '厅' in info:
                    item['rooms'] = info
                elif '层' in info:
                    item['floor'] = info
                elif '向' in info:
                    item['toward'] = info
                elif '年' in info:
                    item['year'] = info
                elif '㎡' in info:
                    item['area'] = info

            item['address'] = dl.xpath(
                './/p[@class="add_shop"]/span/text()').get()
            # total price
            item['price'] = ''.join(
                dl.xpath(
                    './/dd[@class="price_right"]/span[1]//text()').getall())
            # unit price
            item['unit'] = ''.join(
                dl.xpath(
                    './/dd[@class="price_right"]/span[2]//text()').getall())

            detail_url = dl.xpath('.//h4[@class="clearfix"]/a/@href').get()
            item['origin_url'] = response.urljoin(detail_url)
            yield item
            print(item)
            print('==' * 40)

        next_url = response.xpath(
            '//div[@id="list_D10_15"]/p[1]/a/@href').get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_esf,
                                 meta={'info': (province, city)})
Example #17
    def parse_esf(self,response):
        province, city = response.meta.get("info")
        print("esf response:",response.url)
        dl_list = response.xpath("//div[contains(@class,'shop_list')]//dl[@class='clearfix']")
        for dl in dl_list:
            if dl is not None:
                item = ESFHouseItem(province=province,city=city)
                item['name'] =dl.xpath(".//p[@class='add_shop']/a/@title").get()
                infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
                infos = list(map(lambda x:re.sub(r"\s","",x),infos))
                item['rooms'] = None
                item['floor'] = None
                item['toward'] = None
                item['area'] = None
                item['year'] = None
                item['origin_url'] = None
                if infos:
                    for info in infos:
                        if '厅' in info:
                            item['rooms']= info
                        elif '层' in info:
                            item['floor'] = info
                        elif '向' in info:
                            item['toward'] = info
                        elif '㎡' in info:
                            item['area'] = info
                        elif '年' in info:
                            item['year'] = info

                item["address"] = dl.xpath(".//p[@class='add_shop']/span/text()").get()
                item["price"] = "".join(dl.xpath(".//dd[@class='price_right']/span[1]//text()").getall())
                item["unit"] = "".join(dl.xpath(".//dd[@class='price_right']/span[2]/text()").getall())
                detail = dl.xpath(".//h4[@class='clearfix']/a/@href").get()
                if detail is not None:
                    item['origin_url'] =response.urljoin(detail)
                    #print(item['origin_url'] )
                    #item['origin_url'] = dl.xpath("./dt/a/@href").get()
                yield item
        # next page
        next_url = response.xpath("//a[text()='下一页']/@href").get()
        if next_url:
            yield scrapy.Request(
                url=response.urljoin(next_url),
                callback=self.parse_esf,
                meta={"info": (province, city)}
            )
Example #18
    def parse_esf(self, response):
        province, city = response.meta.get('info')

        #print(name)
        dls = response.xpath("//dl[contains(@dataflag,'bg')]")
        for dl in dls:
            item = ESFHouseItem(province=province, city=city)
            name = ''.join(
                dl.xpath(".//dd//p[@class='add_shop']/a/@title").getall())
            name = re.sub(r"\s", "", name)
            item['name'] = name
            infos = dl.xpath(".//dd//p[@class='tel_shop']//text()").getall()
            infos = list(map(lambda x: re.sub(r"\s|\|", '', x), infos))
            infos = list(filter(None, infos))
            for info in infos:
                if "厅" in info:
                    item['rooms'] = info
                elif '层' in info:
                    item['floor'] = info
                elif '年' in info:
                    item['year'] = info
                elif '向' in info:
                    item['toward'] = info
                elif '㎡' in info:
                    item['area'] = info
            address = "".join(
                dl.xpath(".//dd//p[@class='add_shop']//span//text()").getall())
            item['address'] = address
            price = "".join(
                dl.xpath(
                    ".//dd[@class='price_right']//span[@class='red']//text()").
                getall())
            item['price'] = price
            unit = "".join(
                dl.xpath(
                    ".//dd[@class='price_right']//span[2]//text()").getall())
            item['unit'] = unit
            detail_url = "".join(
                dl.xpath(".//h4[@class='clearfix']/a/@href").getall())
            item['origin_url'] = response.urljoin(detail_url)
            yield item
        next_url = response.xpath(
            "//div[@class='page_al']//p[1]/a/@href").get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_esf,
                                 meta={"info": (province, city)})
Example #19
 def parse_esf(self, response):
     province, city = response.meta.get("info")
     dls = response.xpath("//div[contains(@class, 'shop_list')]/dl")
     for dl in dls:
         item = ESFHouseItem(province=province, city=city)
         name = dl.xpath(".//span[@class='tit_shop']/text()").get()
         if not name:
             continue
         infos = "".join(
             dl.xpath(".//p[@class='tel_shop']//text()").getall()).strip()
         infos = infos.split("|")
         if infos[0] == "独栋":
             item['rooms'] = infos[1].strip() + "[别墅]"
             item['area'] = infos[3].strip()
             item['floor'] = infos[2].strip()
             item['toward'] = infos[4].strip()
         else:
             item['rooms'] = infos[0].strip()
             item['area'] = infos[1].strip()
             item['floor'] = infos[2].strip()
             item['toward'] = infos[3].strip()
             try:
                 item['year'] = infos[4].strip()
             except IndexError:
                 print("没有年份记录")
         address = dl.xpath(".//p[@class='add_shop']/span/text()").get()
         item['address'] = address
         price_text = "".join(
             dl.xpath(
                 ".//dd[@class='price_right']//text()").getall()).strip()
         price_text = re.sub(r'\s', '', price_text)
         price = price_text.split("万")[0].strip() + "万"
         unit = price_text.split("万")[1].strip()
         item['price'] = price
         item['unit'] = unit
         origin_url_text = dl.xpath(
             ".//h4[@class='clearfix']/a/@href").get()
         origin_url = response.urljoin(origin_url_text)
         item['origin_url'] = origin_url
         yield item
     next_url = response.xpath(
         ".//div[@class='page_al']/p[1]/a/@href").get()
     if next_url:
         yield scrapy.Request(url=response.urljoin(next_url),
                              callback=self.parse_esf,
                              meta={"info": (province, city)})
Example #20
 def parse_esf(self,response):
     province,city = response.meta.get("info")
     dls = response.xpath("//div[@class='shop_list shop_list_4']/dl")
     for dl in dls:
         item = ESFHouseItem(province=province,city=city)
         item['name'] = dl.xpath(".//p[@class='add_shop']/a/@title").getall()
         infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
         infos = list(map(lambda x:re.sub(r"\s","",x),infos))
         for info in infos:
             try:
                 if "厅" in info:
                     item['rooms'] = info
                 elif "卧室" in info:
                     item['rooms'] = info
                 elif "㎡" in info:
                     item['area'] = info
                 elif "层" in info:
                     item['floor'] = info
                 elif "叠加" in info:
                     item['floor'] = info
                 elif "双拼" in info:
                     item['floor'] = info
                 elif "独栋" in info:
                     item['floor'] = info
                 elif '向' in info:
                     item['toward'] = info
                 elif '年建' in info:
                     item['year'] = info.replace('年建',"")
             except:
                 pass
             #print(item)
         item['address'] = dl.xpath(".//p[@class='add_shop']/span/text()").get()
         item['price'] = dl.xpath(".//dd[@class='price_right']/span/b/text()").get()
         try:
             unit = dl.xpath(".//dd[@class='price_right']//span/text()").getall()
             item['unit'] = unit[1]
         except:
             pass
         origin_url = response.urljoin(dl.xpath(".//h4[@class='clearfix']/a/@href").get())
         item['origin_url'] = origin_url
     #print(item)
         yield item
     next_url = response.xpath("//div[@class='page_al']/p/a/@href").get()
     if next_url:
         yield scrapy.Request(url=response.urljoin(next_url), callback=self.parse_esf, meta={"info": (province, city)})
Example #21
    def parse_esf(self, response):
        province, city = response.meta.get("info")

        dls = response.xpath("//dl[@class='clearfix']")
        for dl in dls:
            item = ESFHouseItem(province=province, city=city)
            name = dl.xpath(".//p[@class='add_shop']/a/text()").get()
            if name is not None:
                name = name.strip()
            item['name'] = name
            infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
            infos = list(map(lambda x: re.sub(r"\s", "", x), infos))
            for info in infos:
                if "厅" in info:
                    item['rooms'] = info
                elif "层" in info:
                    item['floor'] = info
                elif "向" in info:
                    item['toward'] = info
                elif "年建" in info:
                    item['year'] = info.replace("年建", "")
                elif "㎡" in info:
                    item['area'] = info
                # print(item)
            # print(infos)
            address = dl.xpath(".//p[@class='add_shop']//span/text()").get()
            item['address'] = address

            price = "".join(
                dl.xpath(".//dd[@class='price_right']/span//text()").getall()
                [:2])
            unit = dl.xpath(
                ".//dd[@class='price_right']/span[2]//text()").get()
            item['price'] = price
            item['unit'] = unit
            origin_url = response.urljoin(
                dl.xpath(".//h4[@class='clearfix']/a/@href").get())
            item['origin_url'] = origin_url
            yield item
        next_url = response.xpath("//div[@class='page_al']/p[2]/a/@href").get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_esf,
                                 meta={"info": (province, city)})
Example #22
 def parse_esf(self,response):
     province,city = response.meta.get('info')
     dls = response.xpath("//div[@class='shop_list shop_list_4']/dl")
     for dl in dls:
         item = ESFHouseItem(province = province,city = city)
         item['name'] = dl.xpath(".//p[contains(@class,'add_shop')]/a/text()").get()
         if item['name'] is not None:
           item['name'] = item['name'].strip()
         infos = dl.xpath(".//p[@class='tel_shop']//text()").getall()
         infos = list(map(lambda x: re.sub(r"\s|\|", "", x), infos))
         #print(infos)
         for info in  infos:
             if "厅" in info:
                 item['rooms'] = info
             elif "㎡" in info:
                 item['area'] = info
             elif "层" in info:
                 item['floor'] = info
             elif "向" in info:
                 item['toward'] = info
             elif "建" in info:
                 item['year'] = info
                 #print(item)
         item['address'] = dl.xpath(".//p[@class='add_shop']/span/text()").get()
         item['price'] = "".join(dl.xpath(".//span[contains(@class,'red')]//text()").getall())
         if item['price'] != "":
             item['price'] = re.sub(r"\s|热搜", "", item['price'])
         item['unit'] = dl.xpath(".//dd[contains(@class,'price_right')]/span[2]/text()").get()
         ori_url = dl.xpath(".//h4[@class='clearfix']/a/@href").get()
         if ori_url is not None:
            item['origin_url'] = response.urljoin(ori_url)
            #print(item['origin_url'])
         #print(item)
         yield item
     next_url = response.xpath("//div[@class='page_al']/p[1]/a/@href").get()
     if next_url:
         yield scrapy.Request(url=response.urljoin(next_url), callback=self.parse_esf, meta={"info": (province, city)})
Example #23
    def parse_esf(self, response):

        province, city, cityabbr = response.meta.get("info")
        temp_url = "https://" + cityabbr + ".esf.fang.com"
        dls = response.xpath("//div[contains(@class,'shop_list')]/dl")

        for dl in dls:
            item = ESFHouseItem(province=province, city=city)
            item['name'] = dl.xpath(".//p[@class='add_shop']/a/@title").get()
            item['address'] = dl.xpath(
                ".//p[@class='add_shop']/span/text()").get()
            item['price'] = "".join(
                dl.xpath(
                    ".//dd[@class='price_right']/span[1]//text()").getall())
            item['unit'] = dl.xpath(
                ".//dd[@class='price_right']/span[2]//text()").get()
            origin_url = dl.xpath(".//h4/a/@href").get()

            item["origin_url"] = urljoin(temp_url, origin_url)
            infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
            infos = list(map(lambda x: re.sub(r"\s", "", x), infos))
            # print(name)
            # print(address)
            # print(infos)
            for info in infos:
                if "厅" in info:
                    item['rooms'] = info
                elif "层" in info:
                    item["floor"] = info
                elif '向' in info:
                    item["toward"] = info
                elif '建' in info:
                    item["year"] = info.replace("年建", "")
                elif '㎡' in info:
                    item["area"] = info
                # print(item)
            yield item

        next_url = response.xpath("//a[text()='下一页']/@href").get()
        if next_url:
            yield scrapy.Request(url=urljoin(temp_url, next_url),
                                 callback=self.parse_esf,
                                 meta={"info": (province, city, cityabbr)})
Example #24
 def parse_esf_house(self, response):
     province, city = response.meta.get('info')
     dls = response.xpath('//div[@class="houseList"]/dl')
     for dl in dls:
         name = dl.xpath('.//p[@class="title"]/a/@title').get()
         describe = dl.xpath('.//p[@class="mt12"]/text()').getall()
         try:
             rooms = describe[0].strip()
         except Exception:
             rooms = ""
         try:
             floor = describe[1].strip()
         except Exception:
             floor = ""
         try:
             toward = describe[2].strip()
         except Exception:
             toward = ""
         try:
             year = describe[3].strip().split(":")[1]
         except Exception:
             year = ""
         address = dl.xpath('.//p[@class="mt10"]/span/@title').get()
         area = dl.xpath('.//div[contains(@class,"area")]/p/text()').get()
         price = "".join(
             dl.xpath('.//div[@class="moreInfo"]/p/span/text()').getall()
             [0:2])
         unit = "".join(
             dl.xpath(
                 './/div[@class="moreInfo"]/p[last()]//text()').getall())
         origin_url = response.urljoin(dl.xpath('.//p[@class="title"]/a/@href').get())
         item = ESFHouseItem(province=province,city=city,name=name,rooms=rooms,floor=floor,toward=toward,\
                             year=year,address=address,area=area,price=price,unit=unit,origin_url=origin_url)
         yield item
     next_page = response.xpath(
         '//a[@id="PageControl1_hlk_next"]/@href').get()
     if next_page:
         yield scrapy.Request(url=response.urljoin(next_page),
                              callback=self.parse_esf_house,
                              meta={"info": (province, city)})
Example #25
 def parse_esfhoust(self, response):
     province, city = response.meta.get("info")
     dls = response.xpath("//div[@class='shop_list shop_list_4']/dl")
     for dl in dls:
         item = ESFHouseItem(province=province, city=city)
         item['name'] = dl.xpath(".//p[@class='add_shop']/a/@title").get()
         infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
         infos = list(map(lambda x: re.sub(r"\s", "", x), infos))
         for info in infos:
             if "厅" in info:
                 item['rooms'] = info
             elif "层" in info:
                 item["floor"] = info
             elif "向" in info:
                 item['toward'] = info
             elif "年" in info:
                 item['year'] = re.sub(r"建", "", info)
                 # item['year'] = info.replace("建", "")  # this also works
             elif "㎡" in info:
                 item['area'] = info
         item['address'] = dl.xpath(
             ".//p[@class='add_shop']/span/text()").get()
         item['price'] = "".join(
             dl.xpath(
                 ".//dd[@class='price_right']/span[@class='red']//text()").
             getall())
         item['unit'] = "".join(
             dl.xpath(
                 ".//dd[@class='price_right']/span[2]/text()").getall())
         origin_url = dl.xpath(".//h4[@class='clearfix']/a/@href").get()
         item['origin_url'] = response.urljoin(origin_url)
         print(item['origin_url'])
         yield item
     next_url = response.xpath("//div[@class='page_al']/p[1]/a/@href").get()
     if next_url:
         yield scrapy.Request(url=response.urljoin(next_url),
                              callback=self.parse_esfhoust,
                              meta={"info": (province, city)})
Example #26
    def parse_esf(self, response):
        province, city = response.meta.get("info")
        dls = response.xpath("//div[@class='shop_list shop_list_4']/dl")
        for dl in dls:
            item = ESFHouseItem(province=province, city=city)
            item['name'] = dl.xpath("//p[@class='add_shop']/a/text()").get()
            infos = dl.xpath("//p[@class='tel_shop']/text()").getall()
            infos = list(map(lambda x: re.sub(r"\s", "", x), infos))
            for info in infos:
                if "厅" in info:
                    item['rooms'] = info
                elif '层' in info:
                    item['floor'] = info
                elif '向' in info:
                    item['toward'] = info
                elif '㎡' in info:
                    item['area'] = info
                else:
                    item['year'] = info.replace("建筑年代:", "")

            item['address'] = dl.xpath(
                ".//p[@class='add_shop']/span/text()").get()
            item['price'] = dl.xpath(
                ".//dd[@class='price_right']/span[@class='red']//text()"
            ).getall()
            # equivalent to:
            # item['price'] = dl.xpath(".//dd[@class='price_right']/span[1]//text()").getall()
            item['unit'] = dl.xpath(
                ".//dd[@class='price_right']/span[last()]/text()").getall()
            item['origin_url'] = response.urljoin(
                dl.xpath(".//dd/h4[@class='clearfix']/a/@href").get())
            yield item
        next_url = response.xpath(
            "//div[@id='list_D10_15']/p[1]/a/@href").get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_esf,
                                 meta={"info": (province, city)})
Example #27
    def parse_esf(self, response):
        province, city = response.meta.get('info')

        dls = response.xpath("//div[@class='shop_list shop_list_4']/dl")
        for dl in dls:
            item = ESFHouseItem(province=province, city=city)
            item['name'] = dl.xpath(".//p[@class='add_shop']/a/text()").get()
            infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
            infos = list(map(lambda x: re.sub(r'\s', '', x), infos))
            for info in infos:
                if '厅' in info:
                    item['rooms'] = info
                elif '层' in info:
                    item['floor'] = info
                elif '向' in info:
                    item['toward'] = info
                elif '㎡' in info:
                    item['area'] = info
                else:
                    item['year'] = info.replace('年建', '')
            item['address'] = dl.xpath(
                ".//p[@class='add_shop']/span/text()").get()
            item['price'] = "".join(
                dl.xpath(
                    ".//dd[@class='price_right']/span[1]//text()").getall())
            item['unit'] = dl.xpath(
                ".//dd[@class='price_right']/span[2]//text()").get()
            detail_url = dl.xpath(".//dd/h4/a/@href").get()
            item['origin_url'] = response.urljoin(detail_url)
            yield item

        next_url = response.xpath("//div[@class='page_al']/p/a/@href").get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_esf,
                                 meta={"info": (province, city)})
Example #28
    def parse_esf(self, response):

        # get the province and city
        province, city = response.meta.get('info')

        dls = response.xpath("//div[@class='shop_list shop_list_4']/dl")
        for dl in dls:
            item = ESFHouseItem(province=province, city=city)
            # get the complex name
            name = dl.xpath(".//p[@class='add_shop']/a/text()").get()
            if name == None:
                pass
            else:
                item['name'] = name.strip()
                # print(name)

            # get the combined listing info
            infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
            if len(infos) == 0:
                pass
            else:
                infos = list(map(lambda x: re.sub(r"\s", "", x), infos))
                # print(infos)
                for info in infos:
                    if "厅" in info:
                        item['rooms'] = info
                    elif '层' in info:
                        item['floor'] = info
                    elif '向' in info:
                        item['toward'] = info
                    elif '年' in info:
                        item['year'] = info
                    elif '㎡' in info:
                        item['area'] = info
                    # print(item)

            # get the address
            address = dl.xpath(".//p[@class='add_shop']/span/text()").get()
            if address == None:
                pass
            else:
                # print(address)
                item['address'] = address

            # get the total price
            price = dl.xpath(
                "./dd[@class='price_right']/span[1]/b/text()").getall()
            if len(price) == 0:
                pass
            else:
                price = "".join(price)
                # print(price)
                item['price'] = price

            # get the unit price
            unit = dl.xpath("./dd[@class='price_right']/span[2]/text()").get()
            if unit == None:
                pass
            else:
                # print(unit)
                item['unit'] = unit

            # get the detail-page url
            detail_url = dl.xpath(".//h4[@class='clearfix']/a/@href").get()
            if detail_url == None:
                pass
            else:
                origin_url = response.urljoin(detail_url)
                # print(origin_url)
                item['origin_url'] = origin_url
            # print(item)
            yield item
        next_url = response.xpath(".//div[@class='page_al']/p/a/@href").get()
        # print(next_url)
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_esf,
                             meta={"info": (province, city)})