Exemplo n.º 1
0
    def parse_list(self, response):
        """Yield one ``LianjiaNewItem`` per project row of a listing page.

        A fresh item is created for every row, so yielded items never share
        state with one another.

        :param response: the listing-page response to scrape.
        """
        self.logger.debug('parse_list response.url:%s', response.url)

        rows = response.css(
            'ul.resblock-list-wrapper > li[data-project-name]')
        for row in rows:
            item = LianjiaNewItem()
            item['title'] = row.css('.resblock-name a::text').extract_first()
            item['url'] = 'https://sz.fang.lianjia.com' + row.css(
                '.resblock-name a::attr(href)').extract_first()
            item['type'] = row.css('.resblock-type::text').extract_first()
            item['status'] = row.css('.sale-status::text').extract_first()
            item['total_price'] = row.css('.second::text').extract_first()
            item['unit_price'] = row.css('.number::text').extract_first()
            item['img'] = row.css(
                '.lj-lazy::attr(data-original)').extract_first()
            item['area'] = row.css(
                '.resblock-area span::text').extract_first()
            item['location'] = row.css(
                'div.resblock-location > span:nth-child(1)::text'
            ).extract_first()
            item['community'] = row.css(
                'div.resblock-location > span:nth-child(3)::text'
            ).extract_first()
            item['address'] = row.css(
                'div.resblock-location > a::text').extract_first()
            # NOTE(review): getlocation is defined outside this block and is
            # called before 'number' is set; that call order is preserved.
            getlocation(item)
            # Last '_'-separated token of the second-to-last URL path segment.
            item['number'] = item['url'].split('/')[-2].split('_')[-1]
            yield item
Exemplo n.º 2
0
    def parse_list(self, response):
        """Yield one ``TongchengShoprentalItem`` per listing row, then follow
        the "next page" link if there is one.

        A fresh item is created for every row so that optional fields (for
        example ``type``) cannot leak from one row into the next.

        :param response: the listing-page response to scrape.
        """
        self.logger.debug('parse_list response.url:%s', response.url)

        for row in response.css('.house-list-wrap>li'):
            item = TongchengShoprentalItem()
            item['title'] = row.css(
                '.title_des::text').extract_first().strip()
            item['number'] = row.xpath('@logr').extract_first().split('_')[3]
            item['url'] = ('http://sz.58.com/shangpu/' + item['number'] +
                           'x.shtml')
            item['month_price'] = row.css('p.sum > b::text').extract_first()
            item['day_price'] = row.css('.unit span::text').extract_first()
            item['img'] = row.css('img::attr(data-src)').extract_first()

            # The second info paragraph holds either area/type/status or just
            # area/status.
            spans = row.css(
                'div.list-info > p:nth-child(2) > span::text').extract()
            if len(spans) == 3:
                item['area'] = spans[0].strip()
                item['type'] = spans[1].strip()
                item['status'] = spans[2].strip()
            elif len(spans) == 2:
                item['area'] = spans[0].strip()
                item['type'] = ''
                item['status'] = spans[1].strip()

            # "<district>-<location>"; the location part may be missing.
            loc = row.css(
                'div.list-info > p:nth-child(3) > span:nth-child(1)::text'
            ).extract_first().split('-')
            item['district'] = loc[0].strip()
            item['location'] = loc[1].strip() if len(loc) > 1 else ''

            address = row.css(
                'div.list-info > p:nth-child(3) > span:nth-child(2)::text'
            ).extract_first()
            item['address'] = address.replace('-', '') if address else ''

            item['tags'] = ' '.join(
                row.css('div.list-info > p.tag-wrap > span::text').extract())
            # NOTE(review): getlocation is defined outside this block.
            getlocation(item)
            yield item

        links = LinkExtractor(
            restrict_css='div.pager > a.next').extract_links(response)
        if links:
            next_url = links[0].url
            self.logger.debug('next_url:%s', next_url)
            yield Request(next_url, callback=self.parse_list)
Exemplo n.º 3
0
    def parse_list(self, response):
        """Yield one ``AnjukeShoprentalItem`` per listing row, then follow the
        "next page" link if there is one.

        A fresh item is created for every row, so yielded items never share
        state with one another.

        :param response: the listing-page response to scrape.
        """
        self.logger.debug('parse_list response.url:%s', response.url)

        for row in response.css('#list-content>.list-item'):
            item = AnjukeShoprentalItem()
            item['title'] = row.css(
                '.item-title::text').extract_first().strip()
            item['url'] = row.xpath('@link').extract_first()
            item['price'] = row.css('em::text').extract_first() + row.css(
                '.price-a::text').extract()[1].strip()
            item['img'] = row.css('img::attr(src)').extract_first()
            item['area'] = int(
                row.css('dl > dd:nth-child(2) > span:nth-child(1)::text').
                re_first(r'[1-9]\d*|0'))
            item['floor'] = row.css(
                'dl > dd:nth-child(2) > span:nth-child(3)::text'
            ).extract_first()
            item['type'] = row.css(
                'dl > dd:nth-child(2) > span:nth-child(5)::text'
            ).extract_first()
            item['community'] = row.css(
                'dd.address > a::text').extract_first()

            # The address text is split on whitespace; the first token is
            # "[district-location" and the optional second is "address]".
            raw_address = row.css('dd.address > span::text').extract_first()
            comm_address = raw_address.split() if raw_address else []
            if comm_address:
                self.logger.debug('parse_list comm_address:%s', comm_address)
                region = comm_address[0].strip('[').split('-')
                item['district'] = region[0]
                item['location'] = region[1] if len(region) > 1 else ''
                if len(comm_address) > 1:
                    item['address'] = comm_address[1].strip(']')
                else:
                    item['address'] = ''
            else:
                item['district'] = ''
                item['location'] = ''
                item['address'] = ''

            # NOTE(review): getlocation is defined outside this block and is
            # called before 'number' is set; that call order is preserved.
            getlocation(item)
            # Second-to-last path segment of the URL without its query string.
            item['number'] = item['url'].split('?', 1)[0].split('/')[-2]
            yield item

        links = LinkExtractor(
            restrict_css='div.multi-page > a.aNxt').extract_links(response)
        if links:
            next_url = links[0].url
            self.logger.debug('next_url:%s', next_url)
            yield Request(next_url, callback=self.parse_list)
Exemplo n.º 4
0
    def parse_list(self, response):
        """Yield one ``QfangTwoItem`` per listing row, then follow the
        "next page" link if there is one.

        A fresh item is created for every row, so yielded items never share
        state with one another.

        :param response: the listing-page response to scrape.
        """
        self.logger.debug('parse_list response.url:%s', response.url)

        for row in response.css('#cycleListings>ul>li'):
            item = QfangTwoItem()
            item['title'] = row.css(
                '.house-title a::text').extract_first().strip()
            item['url'] = 'https://shenzhen.qfang.com' + row.css(
                '.house-title a::attr(href)').extract_first()
            item['total_price'] = int(
                row.css('.sale-price::text').extract_first())
            item['unit_price'] = int(
                row.css('.show-price p::text').re_first(r'[1-9]\d*'))
            item['img'] = row.css(
                'img::attr(data-original)').extract_first().strip()
            item['layout'] = row.css(
                'p.house-about.clearfix > span:nth-child(2)::text'
            ).extract_first()
            # First integer or decimal number found in the area text.
            item['area'] = float(
                row.css('p.house-about > span:nth-child(4)::text').re_first(
                    r'[1-9]\d*\.\d*|0\.\d*[1-9]\d*|[1-9]\d*|0'))
            item['decoration'] = row.css(
                'p.house-about.clearfix > span:nth-child(6)::text'
            ).extract_first()
            item['floor'] = row.css(
                'p.house-about.clearfix > span:nth-child(8)::text'
            ).extract_first().strip()
            item['orientation'] = row.css(
                'p.house-about.clearfix > span:nth-child(10)::text'
            ).extract_first().strip()
            item['build_year'] = int(
                row.css('p.house-about.clearfix > span:nth-child(12)::text').
                re_first(r'[1-9]\d*'))
            item['district'] = row.css(
                'span.whole-line > a:nth-child(1)::text').extract_first()
            item['location'] = row.css(
                'span.whole-line > a:nth-child(2)::text').extract_first()
            item['community'] = row.css(
                'span.whole-line > a:nth-child(3)::text').extract_first()
            # NOTE(review): getlocation is defined outside this block and is
            # called before 'number' is set; that call order is preserved.
            getlocation(item)
            # Last path segment of the URL without its query string.
            item['number'] = item['url'].split('/')[-1].split('?')[0]
            yield item

        links = LinkExtractor(
            restrict_css='.turnpage_next').extract_links(response)
        if links:
            next_url = links[0].url
            self.logger.debug('next_url:%s', next_url)
            yield Request(next_url, callback=self.parse_list)
Exemplo n.º 5
0
    def parse_list(self, response):
        """Yield one ``AnjukeTwoItem`` per listing row, then follow the
        "next page" link if there is one.

        A fresh item is created for every row so that the address-derived
        fields of one row cannot leak into a later row that has no address.

        :param response: the listing-page response to scrape.
        """
        self.logger.debug('parse_list response.url:%s', response.url)

        for row in response.css('.houselist-mod-new>li'):
            item = AnjukeTwoItem()
            item['title'] = row.css(
                '.house-title a::text').extract_first().strip()
            item['url'] = row.css(
                '.house-title a::attr(href)').extract_first().split('?', 1)[0]
            item['total_price'] = float(
                row.css('strong::text').extract_first())
            item['unit_price'] = int(
                row.css('.unit-price::text').re_first(r'[1-9]\d*|0'))
            item['img'] = row.css('img::attr(src)').extract_first()
            item['layout'] = row.css(
                'div.house-details > div:nth-child(2) > span:nth-child(1)::text'
            ).extract_first()
            item['area'] = row.css(
                'div.house-details > div:nth-child(2) > span:nth-child(3)::text'
            ).extract_first()
            item['floor'] = row.css(
                'div.house-details > div:nth-child(2) > span:nth-child(5)::text'
            ).extract_first()
            item['build_year'] = row.css(
                'div.house-details > div:nth-child(2) > span:nth-child(7)::text'
            ).extract_first()

            # Whitespace-separated: "<community> <district>-<location>-<address>".
            raw_address = row.css('.comm-address::text').extract_first()
            comm_address = raw_address.strip().split() if raw_address else []
            if comm_address:
                self.logger.debug('comm_address :%s', comm_address)
                item['community'] = comm_address[0]
                parts = (comm_address[1].split('-')
                         if len(comm_address) > 1 else [])
                # Pad so a short address yields '' instead of an IndexError.
                parts += [''] * (3 - len(parts))
                item['district'] = parts[0]
                item['location'] = parts[1]
                item['address'] = parts[2]
                # NOTE(review): getlocation is defined outside this block; as
                # in the original it is only called when an address exists.
                getlocation(item)

            # Last path segment of the URL without its query string.
            item['number'] = item['url'].split('/')[-1].split('?')[0]
            yield item

        links = LinkExtractor(
            restrict_css='div.multi-page > a.aNxt').extract_links(response)
        if links:
            next_url = links[0].url
            self.logger.debug('next_url:%s', next_url)
            yield Request(next_url, callback=self.parse_list)
Exemplo n.º 6
0
    def parse_list(self, response):
        """Yield one ``FangtianxiaTwoItem`` per listing row, then follow the
        "下一页" (next page) link if there is one.

        A fresh item is created for every row so that optional fields
        (``orientation``, ``build_year``) cannot leak between rows.

        :param response: the listing-page response to scrape.
        """
        self.logger.debug('parse_list response.url:%s', response.url)

        for row in response.css('.shop_list.shop_list_4>dl[id]'):
            item = FangtianxiaTwoItem()
            item['title'] = row.css('.tit_shop::text').extract_first().strip()
            item['url'] = 'http://esf.sz.fang.com' + row.css(
                'h4 a::attr(href)').extract_first()
            item['total_price'] = float(
                row.css('.red>b::text').extract_first())
            item['unit_price'] = int(
                row.css('.price_right > span:nth-child(2)::text').re_first(
                    r'[1-9]\d*|0'))
            # Prefer the 'src2' attribute when the <img> carries one.
            if row.css('.floatl img[src2]'):
                item['img'] = row.css(
                    '.floatl img::attr(src2)').extract_first()
            else:
                item['img'] = row.css(
                    '.floatl img::attr(src)').extract_first()

            desc = row.css('p.tel_shop::text').extract()
            item['layout'] = desc[0].strip()
            item['area'] = re.search(r'[1-9]\d*', desc[1].strip())[0]
            item['floor'] = desc[2].strip()
            if len(desc) == 6:
                item['orientation'] = desc[3].strip()
                item['build_year'] = desc[4].strip()
            elif len(desc) > 3:
                item['orientation'] = ''
                item['build_year'] = desc[3].strip()

            item['community'] = row.css(
                '.add_shop a::text').extract_first().strip()
            # "<location>-<address>"; the address part may be missing.
            addr = row.css('.add_shop span::text').extract_first().split('-')
            self.logger.debug('addr :%s', addr)
            item['location'] = addr[0].strip()
            item['address'] = addr[1].strip() if len(addr) > 1 else ''
            item['distance'] = row.css(
                '.bg_none.icon_dt::text').extract_first()
            # NOTE(review): getlocation is defined outside this block and is
            # called before 'number' is set; that call order is preserved.
            getlocation(item)
            # File name (without extension) of the last URL path segment.
            item['number'] = item['url'].split('/')[-1].split('.')[0]
            yield item

        next_page = Selector(response).re(r'<a href="(\S*)">下一页</a>')
        if next_page:
            # NOTE(review): item URLs above use 'http://esf.sz.fang.com' while
            # pagination uses 'http://sz.esf.fang.com' - confirm which host
            # is intended.
            next_url = 'http://sz.esf.fang.com' + next_page[0]
            self.logger.debug('next_url:%s', next_url)
            yield Request(url=next_url, callback=self.parse_list)
Exemplo n.º 7
0
    def parse_list(self, response):
        """Yield one ``FangtianxiaShopsaleItem`` per listing row, then follow
        the "next page" link if there is one.

        A fresh item is created for every row, so yielded items never share
        state with one another.

        :param response: the listing-page response to scrape.
        """
        self.logger.debug('parse_list response.url:%s', response.url)

        for row in response.css('.shop_list>dl[id]'):
            item = FangtianxiaShopsaleItem()
            item['title'] = row.css('.tit_shop::text').extract_first().strip()
            item['url'] = 'http://sz.shop.fang.com' + row.css(
                'h4 a::attr(href)').extract_first()
            item['total_price'] = int(
                row.css('.red>b::text').extract_first())
            item['unit_price'] = float(
                row.css('dd.price_right > span:nth-child(2) > i::text ').
                extract_first())
            # Prefer the 'src2' attribute when the <img> carries one.
            if row.css('.floatl img[src2]'):
                item['img'] = row.css(
                    '.floatl img::attr(src2)').extract_first()
            else:
                item['img'] = row.css(
                    '.floatl img::attr(src)').extract_first()
            item['area'] = int(
                row.css('span.color3 > b::text').extract_first())

            # The community name is either inside a link or plain text.
            community = row.css('.add_shop a::text').extract_first()
            if not community:
                community = row.css('.add_shop::text').extract_first()
            item['community'] = community.strip().replace('商铺', '')

            item['address'] = row.css(
                'p.add_shop > span::text').extract_first().strip()
            # Each entry is "<label>:<value>"; keep the value part.
            desc = row.css('.tel_shop::text').extract()
            item['type'] = desc[0].split(':')[1].strip()
            item['floor'] = desc[1].split(':')[1].strip()
            # NOTE(review): getlocation is defined outside this block and is
            # called before 'number' is set; that call order is preserved.
            getlocation(item)
            # File name (without extension) of the last URL path segment.
            item['number'] = item['url'].split('/')[-1].split('.')[0]
            yield item

        links = LinkExtractor(
            restrict_css='#PageControl1_hlk_next').extract_links(response)
        if links:
            next_url = links[0].url
            self.logger.debug('next_url:%s', next_url)
            yield Request(next_url, callback=self.parse_list)
Exemplo n.º 8
0
    def parse_list(self, response):
        """Yield one ``AnjukeNewItem`` per listing row, then follow the
        "next page" link if there is one.

        A fresh item is created for every row so that the optional fields
        (``no_price``, ``price``, ``comment``) cannot leak from one row into
        the next.

        :param response: the listing-page response to scrape.
        """
        self.logger.debug('parse_list response.url:%s', response.url)

        for row in response.css('.key-list .item-mod'):
            item = AnjukeNewItem()
            item['title'] = row.css('.items-name::text').extract_first()
            item['url'] = row.css('.lp-name::attr(href)').extract_first()

            no_price = row.css('.price-txt::text').extract_first()
            if no_price is not None:
                item['no_price'] = no_price

            # The price is rendered as "<text><span>number</span><text>";
            # '.around-price' is checked last and wins if both are present.
            for price_css in ('.price', '.around-price'):
                texts = row.css(price_css + '::text').extract()
                if texts:
                    number = row.css(price_css + ' > span::text').extract()
                    item['price'] = (texts[0].strip() + number[0] +
                                     texts[1].strip())

            item['phone'] = row.css('p.tel::text').extract_first()
            comment = row.css('.list-dp::text').re_first(r'[1-9]\d*|0')
            if comment is not None:
                item['comment'] = int(comment)
            item['img'] = row.css('img::attr(src)').extract_first()

            # All spans but the last are layouts; the last one is the area.
            huxing = row.css('a.huxing > span::text').extract()
            item['layout'] = '/'.join(huxing[:-1])
            item['area'] = huxing[-1] if huxing else ''

            comm_address = row.css(
                '.list-map::text').extract_first().strip().split()
            item['district'] = comm_address[1]
            item['location'] = comm_address[2]
            item['address'] = comm_address[-1]
            item['status'] = row.css(
                'i.status-icon.forsale::text').extract_first()
            item['type'] = row.css(
                'i.status-icon.wuyetp::text').extract_first()
            # NOTE(review): getlocation is defined outside this block and is
            # called before 'number' is set; that call order is preserved.
            getlocation(item)
            # File name (without extension) of the last URL path segment.
            item['number'] = item['url'].split('/')[-1].split('.')[0]
            yield item

        links = LinkExtractor(
            restrict_css='a.next-page.next-link').extract_links(response)
        if links:
            next_url = links[0].url
            self.logger.debug('next_url:%s', next_url)
            yield Request(next_url, callback=self.parse_list)
Exemplo n.º 9
0
    def parse_list(self, response):
        """Yield one ``LianjiaTwoItem`` per listing row of a listing page.

        A fresh item is created for every row so that the optional fields
        (house details, follow statistics) cannot leak between rows.

        :param response: the listing-page response to scrape.
        """
        self.logger.debug('parse_list response.url:%s', response.url)

        for row in response.css('.sellListContent>.clear.LOGCLICKDATA'):
            item = LianjiaTwoItem()
            item['title'] = row.css('.title a::text').extract_first().strip()
            item['url'] = row.css('.title a::attr(href)').extract_first()
            item['total_price'] = float(
                row.css('.totalPrice span::text').extract_first())
            item['unit_price'] = int(
                row.css('.unitPrice span::text').re_first(r'[1-9]\d*|0'))
            item['img'] = row.css(
                '.lj-lazy::attr(data-original)').extract_first()
            item['community'] = row.css('.address a::text').extract_first()

            # '|'-separated details; the sixth field (elevator) is optional.
            desc = row.css('.houseInfo::text').extract_first().split('|')
            if len(desc) in (5, 6):
                item['layout'] = desc[1].strip()
                item['area'] = re.findall(r'[1-9]\d*|0', desc[2].strip())[0]
                item['orientation'] = desc[3].strip()
                item['decoration'] = desc[4].strip()
                item['elevator'] = desc[5].strip() if len(desc) == 6 else ''

            item['floor'] = row.css(
                '.positionInfo::text').extract_first().split('-')[0].strip()
            item['location'] = row.css(
                '.positionInfo a::text').extract_first()

            # "<focus> / <watch> / <pubdate>". str.split never returns an
            # empty list, so the length is checked explicitly.
            num = (row.css('.followInfo::text').extract_first()
                   or '').split('/')
            self.logger.debug('num:%s', num)
            if len(num) >= 3:
                item['focus_num'] = num[0].strip()
                item['watch_num'] = num[1].strip()
                item['pubdate'] = num[2].strip()

            # NOTE(review): getlocation is defined outside this block and is
            # called before 'number' is set; that call order is preserved.
            getlocation(item)
            # File name (without extension) of the last URL path segment.
            item['number'] = item['url'].split('/')[-1].split('.')[0]
            yield item
Exemplo n.º 10
0
    def parse_list(self, response):
        """Yield one ``TongchengTwoItem`` per listing row, then follow the
        "next page" link if there is one.

        A fresh item is created for every row, so yielded items never share
        state with one another.

        :param response: the listing-page response to scrape.
        """
        self.logger.debug('parse_list response.url:%s', response.url)

        for row in response.css('.house-list-wrap>li'):
            item = TongchengTwoItem()
            item['title'] = row.css('.title a::text').extract_first().strip()
            item['url'] = row.css('.title a::attr(href)').extract_first()
            item['total_price'] = float(
                row.css('.sum b::text').extract_first())
            item['unit_price'] = int(
                row.css('.unit::text').re_first(r'[1-9]\d*|0'))
            item['time'] = row.css('.time::text').extract_first()
            item['img'] = row.css('img::attr(data-src)').extract_first()
            item['layout'] = row.css(
                'div.list-info > p:nth-child(2) > span:nth-child(1)::text'
            ).extract_first().strip()

            area_text = row.css(
                'div.list-info > p:nth-child(2) > span:nth-child(2)::text')
            # Lazy %-formatting keeps the debug line from raising when the
            # area text is missing.
            self.logger.debug('parse_list area:%s', area_text.extract_first())
            # First integer or decimal number found in the area text.
            item['area'] = float(area_text.re_first(
                r'[1-9]\d*\.\d*|0\.\d*[1-9]\d*|[1-9]\d*|0'))

            item['orientation'] = row.css(
                'div.list-info > p:nth-child(2) > span:nth-child(3)::text'
            ).extract_first()
            item['floor'] = row.css(
                'div.list-info > p:nth-child(2) > span:nth-child(4)::text'
            ).extract_first()
            item['community'] = row.css(
                'div.list-info > p:nth-child(3) > span:nth-child(1) > a:nth-child(1)::text'
            ).extract_first()
            item['district'] = row.css(
                'div.list-info > p:nth-child(3) > span:nth-child(1) > a:nth-child(2)::text'
            ).extract_first()
            # The third link (location) is optional.
            item['location'] = row.css(
                'div.list-info > p:nth-child(3) > span:nth-child(1) > a:nth-child(3)::text'
            ).extract_first() or ''
            # NOTE(review): getlocation is defined outside this block and is
            # called before 'number' is set; that call order is preserved.
            getlocation(item)
            # File name (without extension) of the last URL path segment.
            item['number'] = item['url'].split('/')[-1].split('.')[0]
            yield item

        links = LinkExtractor(
            restrict_css='div.pager > a.next').extract_links(response)
        if links:
            next_url = links[0].url
            self.logger.debug('next_url:%s', next_url)
            yield Request(next_url, callback=self.parse_list)
Exemplo n.º 11
0
    def parse_list(self, response):
        """Yield one ``QfangNewItem`` per listing row, then follow the
        "next page" link if there is one.

        A fresh item is created for every row so that the optional fields
        (``type``, ``total_price``) cannot leak from one row into the next.

        :param response: the listing-page response to scrape.
        """
        self.logger.debug('parse_list response.url:%s', response.url)

        for row in response.css('#newhouse-list > .clearfix'):
            item = QfangNewItem()
            item['title'] = row.css(
                '.house-title a::text').extract_first().strip()
            item['alias'] = row.css(
                'div.house-title.clearfix > span::text').extract_first()
            item['url'] = 'https://shenzhen.qfang.com' + row.css(
                '.house-title a::attr(href)').extract_first()
            item['status'] = row.css('.state-label::text').extract_first()
            item['unit_price'] = row.css('.sale-price::text').extract_first()
            total_price = row.css('.show-price p::text').extract_first()
            if total_price is not None:
                item['total_price'] = total_price
            item['img'] = row.css('img::attr(src)').extract_first().strip()

            # Either [region, type, decoration] or [region, decoration];
            # region is "<district> <location>".
            desc = row.css('div.natures > span::text').extract()
            if len(desc) in (2, 3):
                region = desc[0].split()
                item['district'] = region[0] if region else ''
                item['location'] = region[1] if len(region) > 1 else ''
                if len(desc) == 3:
                    item['type'] = ' '.join(desc[1].strip().split())
                item['decoration'] = desc[-1].strip()

            item['layout'] = ' '.join(
                row.css('div.new-house-dsp > p:nth-child(1) > span::text').
                extract())
            item['area'] = row.css(
                'div.new-house-dsp > p:nth-child(2) > span::text'
            ).extract_first()
            item['time'] = row.css(
                'div.new-house-dsp > p:nth-child(3) > span::text'
            ).extract_first().strip()
            item['address'] = row.css(
                'div.new-house-dsp > p:nth-child(4) > span::text'
            ).extract_first().strip()

            # When an <em> is present the phone text is split around it:
            # "<text><em>text</em><text>".
            if row.css('p.new-house-phone > em'):
                phone_parts = row.css('p.new-house-phone::text').extract()
                phone_em = row.css(
                    'p.new-house-phone > em::text').extract_first()
                item['phone'] = (phone_parts[0].strip() + phone_em +
                                 phone_parts[1].strip())
            else:
                item['phone'] = row.css(
                    'p.new-house-phone::text').extract_first()

            # NOTE(review): getlocation is defined outside this block and is
            # called before 'number' is set; that call order is preserved.
            getlocation(item)
            # Last path segment of the URL without its query string.
            item['number'] = item['url'].split('/')[-1].split('?')[0]
            yield item

        links = LinkExtractor(
            restrict_css='.turnpage_next').extract_links(response)
        if links:
            next_url = links[0].url
            self.logger.debug('next_url:%s', next_url)
            yield Request(next_url, callback=self.parse_list)