示例#1
0
    def parse_news(self, response):
        """Parse a news listing page and emit its stories as one item.

        If the URL contains no ``'_'``, every distinct pager link found under
        ``div.page`` is requested with this same callback.  Each
        ``li.storyList`` node then becomes one news entry.

        Args:
            response: the listing page.  ``response.meta`` must hold
                ``house_id``; ``house_link`` is additionally read when pager
                links are followed.

        Yields:
            ``Request`` objects for the pager links, then a dict with the
            keys ``house_id``, ``table`` and ``item`` (``('news', list)``).
            The item is skipped, with a warning, when no story node exists.
        """
        if '_' not in response.url:
            # A set keeps each pager link from being requested twice.
            pages = set(find(response, '//div[@class="page"]//a/@href', False))
            for page in pages:
                yield Request(response.meta['house_link'].strip('/') + page,
                              callback=self.parse_news,
                              meta={'house_id': response.meta['house_id']})

        story_list = response.xpath('//li[@class="storyList"]')
        if not story_list:
            self.logger.warning('no story %s', response.url)
            return

        news = []
        for story in story_list:
            try:
                news_content = {
                    'news_title': find(story, './h2/a/text()'),
                    'news_link': find(story, './h2/a/@href'),
                }
            except Exception:
                # Headline extraction failed: fall back to the joined
                # paragraph text.  ``Exception`` (not a bare ``except``) so
                # KeyboardInterrupt / SystemExit are never swallowed.
                news_content = ' '.join(find(story, './p/text()', False))
            news.append({
                'update_at': find(story, './div/text()'),
                'news_content': news_content
            })

        yield {
            'house_id': response.meta['house_id'],
            'table': self.name,
            'item': ('news', news)
        }
示例#2
0
    def parse_news(self, response):
        """Build the news item from the ``div#all_hidden`` story nodes.

        Each child ``div`` contributes its ``link`` attribute; the
        second-to-last ``/``-separated segment of that link is stored as
        ``update_at``.  When the node contains an ``img``, its ``alt`` and
        ``src`` are added as title and image link.

        Args:
            response: the news page; ``response.meta['house_id']`` is read.

        Yields:
            One dict with ``house_id``, ``table`` and
            ``item == ('news', [...])``; nothing (after a warning) when no
            story node is present.
        """
        story_list = response.xpath('//div[@id="all_hidden"]/div')
        if not story_list:
            self.logger.warning('no story %s', response.url)
            return

        news = []
        for story in story_list:
            link = find(story, './@link')
            _news = {
                'news_content': {
                    'news_link': link
                },
                'update_at': link.split('/')[-2]
            }
            try:
                img = story.xpath('.//img')[0]
                _news['news_content'].update({
                    'news_title': find(img, './@alt'),
                    'img_link': find(img, './@src')
                })
            except Exception:
                # The image data is optional (best effort): keep the bare
                # link entry, but leave a trace instead of failing silently.
                self.logger.debug('no image info for story %s', link)
            news.append(_news)

        yield {
            'house_id': response.meta['house_id'],
            'table': self.name,
            'item': ('news', news)
        }
示例#3
0
    def parse_news(self, response):
        """Collect the stories of a news page and follow its pager links.

        Pager links are only followed when the last character of the URL
        (trailing slashes ignored) is not a digit; the first four links
        returned by the pagination XPath are skipped.

        Args:
            response: the news page; ``response.meta['house_id']`` is read.

        Yields:
            ``Request`` objects for further pages, then one dict with
            ``house_id``, ``table`` and ``item == ('news', [...])``.
        """
        url_ends_with_digit = response.url.rstrip('/')[-1].isdigit()
        if not url_ends_with_digit:
            pager_links = find(
                response, '//div[@class="module-pagination"]/a/@href', False)
            for href in pager_links[4:]:
                yield Request(href,
                              callback=self.parse_news,
                              meta={'house_id': response.meta['house_id']})

        stories = response.xpath('//div[@class="new-review-title"]')
        if not stories:
            self.logger.warning('no story %s', response.url)
            return

        news = []
        for node in stories:
            published = find(node, './div[2]/text()')
            headline = find(node, './div[1]/a/text()')
            target = find(node, './div[1]/a/@href')
            news.append({
                'update_at': published,
                'news_content': {
                    'news_title': headline,
                    'news_link': target,
                },
            })

        yield {
            'house_id': response.meta['house_id'],
            'table': self.name,
            'item': ('news', news),
        }
示例#4
0
 def parse_pic_link(self, response):
     """Request the picture list pages of every album category.

     For each node matched by ``sf.PICS`` (the ``'户型'`` category is
     skipped) the picture count is converted into a number of list pages,
     assuming 6 pictures per page, capped at 3 pages.

     Args:
         response: the album overview page; ``response.meta['house_id']``
             is read and the part of the URL before ``'/photo'`` is used
             as host.

     Yields:
         ``Request`` objects handled by ``parse_pic`` carrying ``label``
         and ``house_id`` in ``meta``.
     """
     pics = response.xpath(sf.PICS)
     if not pics:
         self.logger.warning('pictures unreachable %s', response.url)
         return
     host = response.url.split('/photo')[0]
     for pic in pics:
         pic_total_num = find(pic, './em/text()')
         pic_label = find(pic, './span/text()')
         if pic_label == '户型':
             continue
         pic_id = find(pic, './@href').split('list_')[-1].split('_')[0]
         # TODO: only the leading pictures are fetched; fetching more would
         # make parse_pic yield items that the mongo pipeline stores with a
         # "set", overwriting the earlier ones.
         try:
             total = int(pic_total_num)
         except (TypeError, ValueError):
             self.logger.warning('bad picture count %r %s', pic_total_num,
                                 response.url)
             continue
         # Ceiling division: the former ``total / 6 + 1`` used as an
         # exclusive bound requested no page at all for fewer than 6
         # pictures and dropped the last page for non-multiples of 6.
         # Only the first 3 pages are fetched.
         page_count = min((total + 5) // 6, 3)
         for page in range(1, page_count + 1):
             url = self.picture_url.format(host, response.meta['house_id'],
                                           pic_id, page)
             yield Request(url=url,
                           callback=self.parse_pic,
                           meta={
                               'label': pic_label,
                               'house_id': response.meta['house_id'],
                           })
示例#5
0
文件: qq.py 项目: broholens/GoodHouse
    def parse_xiangce(self, response):
        """Emit the album of a project and, when present, its room pictures.

        TODO: when a category holds more than 8 pictures, not all of them
        are retrieved.

        Args:
            response: the album page; ``response.meta['house_id']`` is read.

        Yields:
            Optionally a ``<name>_room`` record (when the first picture
            group has the id ``'_apartment'``), then the album item.
        """
        pics = response.xpath(q.PICS)
        if not pics:
            self.logger.error('pic not found %s', response.url)
            return

        pictures = {
            'house_id': response.meta['house_id'],
            'table': self.name,
        }

        # Room types
        first_group = pics[0]
        if find(first_group, './@id') == '_apartment':
            room_album = []
            for entry in first_group.xpath('.//ul/li'):
                room_album.append({
                    'room_label': find(entry, './div[2]/a/text()'),
                    'room_url': find(entry, q.IMG),
                })
            yield {
                'new_data': True,
                'house_id': response.meta['house_id'],
                'table': self.name + '_room',
                'room_album': room_album,
            }

        # Pictures of every category
        album = []
        for group in pics:
            title = find(group, q.TITLE)
            album.extend(
                {'picture_title': title, 'picture_url': src}
                for src in find(group, q.IMG, False)
            )
        pictures['item'] = ('album', album)
        yield pictures
示例#6
0
 def parse_xiangce_link(self, response):
     """Follow every album category link found on the page.

     The label passed on is the link text up to the first ``'('``.

     Args:
         response: the album index page; ``response.meta['house_id']`` is
             forwarded to each request.

     Yields:
         ``Request`` objects handled by ``parse_xiangce``.
     """
     anchors = response.xpath(shjd_xp.PICTURE_URLS)
     if not anchors:
         self.logger.warning('picture unreachable! %s', response.url)
         return
     for anchor in anchors:
         target = find(anchor, './@href')
         meta = {
             'house_id': response.meta['house_id'],
             'label': find(anchor, './text()').split('(')[0],
         }
         yield Request(url=target, callback=self.parse_xiangce, meta=meta)
示例#7
0
    def parse_room(self, response):
        """Parse one floor-plan page into a room record.

        The house id is the text before the first ``'-'`` in the last URL
        segment.  Pictures, labels, prices, details and description are
        optional sections: a missing one is logged as a warning and left
        out.  Price/detail rows whose label is not a key of
        ``room_price_dict`` / ``room_details_dict`` are logged and skipped.

        Args:
            response: the floor-plan detail page.

        Yields:
            A single dict whose ``table`` is ``self.name + '_room'``.
        """
        # Parse the floor-plan data.
        page_url = response.url
        room = {
            'new_data': True,
            'house_id': page_url.split('/')[-1].split('-')[0],
            'table': self.name + '_room',
        }

        pic_nodes = response.xpath(ajk_xp.ROOM_TYPE_PICS)
        if pic_nodes:
            album = []
            for node in pic_nodes:
                album.append({
                    'picture_title': find(node, './@data-title'),
                    'picture_url': find(node, './img/@imglazyload-src'),
                })
            room['room_album'] = album
        else:
            self.logger.warning('room pictures is empty. %s', page_url)

        titles = find(response, ajk_xp.ROOM_TITLES)
        room['room_type'] = titles.split(',')[0]

        labels = find(response, ajk_xp.ROOM_LABELS, False)
        if labels:
            # The first label is stored as the sale status, the rest as tags.
            room['room_sale_status'] = labels[0]
            room['room_labels'] = list(labels[1:])
        else:
            self.logger.warning('room labels is empty %s', page_url)

        price_rows = response.xpath(ajk_xp.ROOM_PRICE)
        if price_rows:
            for row in price_rows:
                label = find(row, './/strong/text()')
                if label in room_price_dict:
                    room[room_price_dict[label]] = find(row, './span/text()')
                else:
                    self.logger.warning('key %s unknown %s', label, page_url)
        else:
            self.logger.warning('room price is empty %s', page_url)

        detail_rows = response.xpath(ajk_xp.ROOM_DETAILS)
        if detail_rows:
            for row in detail_rows:
                label = find(row, './strong/text()')
                if label in room_details_dict:
                    room[room_details_dict[label]] = find(row,
                                                          './span/text()')
                else:
                    self.logger.warning('key %s unknown %s', label, page_url)
        else:
            self.logger.warning('room details is empty %s', page_url)

        description_parts = find(response, ajk_xp.ROOM_DESCRIPTION, False)
        if description_parts:
            room['room_description'] = ' '.join(description_parts)
        else:
            self.logger.warning('room description is empty %s', page_url)

        yield room
示例#8
0
 def parse_xiangce(self, response):
     """Emit the pictures of one album category as an ``album`` item.

     Args:
         response: the category page; ``house_id`` and ``label`` are read
             from ``response.meta``.

     Yields:
         One dict with ``house_id``, ``table`` and
         ``item == ('album', [...])``; the list is empty when the picture
         XPath matches nothing.
     """
     house_id = response.meta['house_id']
     table = self.name
     album = []
     for img in response.xpath(shjd_xp.PICTURES):
         album.append({
             'picture_url': find(img, './@src'),
             'picture_label': response.meta['label'],
             'picture_description': find(img, './@data-name'),
         })
     yield {
         'house_id': house_id,
         'table': table,
         'item': ('album', album),
     }
示例#9
0
    def base(self, house, item, d):
        """Copy one label/value cell pair of a table row into ``house``.

        The label is read from ``td.label-<d>`` (``':'`` stripped from both
        ends) and translated through ``base_info_dict``; the value comes
        from ``td.text-<d>``, falling back to ``td.text-full``.

        Args:
            house: dict being filled; mutated in place.
            item: selector of the table row.
            d: suffix used in the cell class names.

        Returns:
            None.  Rows without a label, with the label ``'售楼电话'``, or
            with a label missing from ``base_info_dict`` (logged as a
            warning) are skipped.
        """
        # Evaluate the label XPath once instead of twice.
        label = find(item, f'./td[@class="label-{d}"]/text()')
        key = label.strip(':') if label else label
        if not key or key == '售楼电话':
            return

        if key not in base_info_dict:
            self.logger.warning('unknown key %s', key)
            return

        # The value is only looked up for rows that are actually stored.
        house[base_info_dict[key]] = (
            find(item, f'./td[@class="text-{d}"]/text()')
            or find(item, f'./td[@class="text-full"]/text()'))
示例#10
0
    def parse_house_link(self, response):
        """Schedule the detail, floor-plan, album and news pages of each listing.

        The house id is the last path segment of a listing link without its
        extension; the city key is the text between the last ``'/'`` and
        the first ``'.'`` of the link and is translated through ``CITY``.

        Args:
            response: a listing index page.

        Yields:
            Four ``Request`` objects per listing link, in the order
            parameters, floor plans, pictures, news.
        """
        # Collect every listing link on the page and visit them one by one.
        links = find(response, ajk_xp.HOUSE_LINKS, False)
        if not links:
            self.logger.error('cannot find house link of %s', response.url)
            return

        for link in links:
            last_segment = link.rstrip('/').split('/')[-1]
            house_id = last_segment.split('.')[0]
            city = link.split('.')[0].split('/')[-1]
            host = link.split(house_id)[0]
            canshu_url = f'{host}canshu-{house_id}.html'

            # Basic parameters
            yield Request(canshu_url,
                          callback=self.parse_house,
                          meta={'house_id': house_id, 'city': CITY[city]})

            # Floor plans
            huxing_url = canshu_url.replace('canshu', 'huxing')
            yield Request(huxing_url, callback=self.parse_room_count)

            # Pictures
            xiangce_url = canshu_url.replace('canshu', 'xiangce')
            yield Request(xiangce_url,
                          callback=self.parse_pic,
                          meta={'house_id': house_id})

            # News
            news_url = canshu_url.replace('canshu', 'officialnews')
            yield Request(news_url,
                          callback=self.parse_news,
                          meta={'house_id': house_id})
示例#11
0
 def parse(self, response):
     """Request every result page of the listing index.

     The page count is the total count divided by 20, rounded up.  Each
     request carries URL templates (``xq``, ``hx``, ``xc``, ``dt``) built
     from the current URL; ``parse_house_link`` fills in the house id.

     Args:
         response: the first index page.

     Yields:
         One ``Request`` per result page.
     """
     total_count = find(response, shjd_xp.TOTAL_COUNT)
     if not total_count:
         self.logger.error('total count not clear! %s', response.url)
         return

     total_pages = int(ceil(int(total_count) / 20))

     # NOTE: a disabled variant for URLs starting with
     # 'http://sz.focus.cn/' (page suffix '_p{}' and '/loupan/{}/...'
     # templates) used to live here as commented-out code.
     root = response.url
     page_template = root + 'p{}/'
     templates = {
         'xq': root + '{}/xiangqing.html',
         'hx': root + '{}/huxing/',
         'xc': root + '{}/xiangce/',
         'dt': root + '{}/dongtai/',
     }
     for number in range(1, total_pages + 1):
         yield Request(page_template.format(number),
                       callback=self.parse_house_link,
                       meta=dict(templates))
示例#12
0
 def parse_room_url(self, response):
     """Follow every floor-plan link found on the page.

     Args:
         response: a floor-plan listing page.

     Yields:
         One ``Request`` per link, handled by ``parse_room``; nothing
         (after a warning) when no link is found.
     """
     # Extract all floor-plan links and visit them one by one.
     room_links = find(response, ajk_xp.ROOM_URLS, False)
     if room_links:
         yield from (
             Request(link, callback=self.parse_room) for link in room_links
         )
         return
     self.logger.warning('room urls is empty. %s', response.url)
示例#13
0
    def parse_huxing(self, response):
        """Parse one floor-plan page into a room record.

        Pictures, the info rows and the description are optional; a missing
        section is logged as a warning and left out of the record.

        Args:
            response: the floor-plan page; ``response.meta['house_id']`` is
                read.

        Yields:
            A single dict whose ``table`` is ``self.name + '_room'``.
        """
        room = {
            'new_data': True,
            'house_id': response.meta['house_id'],
            'table': self.name + '_room',
        }

        room_pics = find(response, shjd_xp.ROOM_PICS, False)
        if not room_pics:
            self.logger.warning('room pictures unreachable! %s', response.url)
        else:
            room['room_album'] = [{'picture_url': img} for img in room_pics]

        room['room_type'] = find(response, shjd_xp.ROOM_TYPE)
        room['room_sale_status'] = find(response, shjd_xp.SALE_STATUS)
        price = ''.join(find(response, shjd_xp.ROOM_PRICE, False))
        room['reference_price'] = price

        room_info = response.xpath(shjd_xp.ROOM_INFO)
        if not room_info:
            self.logger.warning('room info unreachable! %s', response.url)
        else:
            for item in room_info:
                key = find(item, './label/text()')
                # The scraped value used to be written into
                # ``self.room_details_dict`` itself, so it never reached the
                # record and the shared dict was overwritten on every page.
                # NOTE(review): assumes ``self.room_details_dict`` maps page
                # labels to record field names, like the other mapping
                # dicts in this project - confirm.
                field = self.room_details_dict.get(key)
                if field is None:
                    self.logger.warning('key %s unknown %s', key,
                                        response.url)
                    continue
                room[field] = find(item, './text()')

        room_des = response.xpath(shjd_xp.ROOM_DESCRIPTION)
        if not room_des:
            self.logger.warning('room description empty! %s', response.url)
        else:
            room['room_description'] = find(room_des, '../div/text()')

        yield room
示例#14
0
    def parse_house(self, response):
        """Parse the basic parameters of a listing into a house record.

        Every node matched by ``ajk_xp.ITEMS`` is read as a name/value row.
        The name is translated through ``base_info_dict``; unknown names
        are logged and skipped, and the rows ``'楼盘图片'`` and
        ``'售楼处电话'`` are ignored.  How the value is extracted depends on
        the row name.

        Args:
            response: the parameter page; ``house_id`` and ``city`` are
                read from ``response.meta``.

        Yields:
            A single house dict.
        """
        # Parse the basic parameters of the listing.
        house = {
            'new_data': True,
            'sale_status': find(response, ajk_xp.SALE_STATUS),
            'house_id': response.meta['house_id'],
            'city': response.meta['city'],
            'table': self.name
        }
        trend_suffix = '[价格走势]'
        # Parameters
        for item in response.xpath(ajk_xp.ITEMS):
            name = find(item, ajk_xp.NAME)
            if not name or name in ('楼盘图片', '售楼处电话'):
                continue
            if name not in base_info_dict:
                self.logger.warning('name %s unknown %s', name, response.url)
                continue
            if name in ('楼盘名称', '开发商', '物业公司'):
                value = find(item, './/a/text()') \
                        or find(item, './div[2]/text()')
            elif name in ('楼盘特点', '楼盘户型'):
                value = find(item, './/a/text()', False)
                if name == '楼盘户型':
                    value = house_type_split(value)
            elif name in ('区域位置', '参考单价'):
                value = ''.join(find(item, './/text()', False)).strip()
                # str.lstrip / str.rstrip treat their argument as a set of
                # characters, so they could also eat characters belonging
                # to the value.  Remove the exact prefix / suffix instead.
                if value.startswith(name):
                    value = value[len(name):]
                if value.endswith(trend_suffix):
                    value = value[:-len(trend_suffix)]
                value = value.strip()
            else:
                value = find(item, './div[contains(@class, "des")]/text()')
            house[base_info_dict[name]] = value

        yield house
示例#15
0
文件: qq.py 项目: broholens/GoodHouse
    def parse_xiangqing(self, response):
        """Parse the detail page of a project into a house record.

        The ``li`` rows of the ``basics``, ``saleIntro``, ``building`` and
        ``property`` blocks are translated through ``base_info_dict``;
        unknown names are logged and skipped.  The joined "other info" text
        is stored under ``transportation``.

        Args:
            response: the detail page; ``house_id`` and ``city`` are read
                from ``response.meta`` (the city is translated via
                ``CITY``).

        Yields:
            A single house dict.
        """
        house = {
            'new_data': True,
            'table': self.name,
            'house_id': response.meta['house_id'],
            'city': CITY[response.meta['city']],
            'building_name': find(response, q.NAME),
            'alias_name': find(response, q.ALIAS),
            'description': ''.join(find(response, q.DESCRIPTION, False))
        }
        for div_id in ('basics', 'saleIntro', 'building', 'property'):
            xp = f'//div[@id="{div_id}"]/div[2]/ul/li'
            for item in response.xpath(xp):
                name = find(item, './span/text()')
                if not name:
                    continue
                if name not in base_info_dict:
                    self.logger.warning('name %s not in dict %s', name,
                                        response.url)
                    continue
                house[base_info_dict[name]] = find(item, './p/text()')
        try:
            other_info = ' '.join(find(response, q.OTHER_INFO_MORE, False))
        except Exception:
            # The expanded block could not be read: fall back to the short
            # one.  ``Exception`` (not a bare ``except``) so that
            # KeyboardInterrupt / SystemExit are never swallowed.
            other_info = ' '.join(find(response, q.OTHER_INFO, False))
        house['transportation'] = other_info

        yield house
示例#16
0
 def parse(self, response):
     """Request every result page of the listing index.

     The page count is read via ``sf.PAGE_COUNT`` (surrounding ``'/'``
     removed); page URLs are the current URL plus ``/b9<page>/``.

     Args:
         response: the first index page.

     Yields:
         One ``Request`` per page, handled by ``parse_house_link``.
     """
     raw_count = find(response, sf.PAGE_COUNT)
     if not raw_count:
         self.logger.error('cannot find pages of %s', response.url)
         return
     last_page = int(raw_count.strip('/'))
     number = 1
     while number <= last_page:
         prefix = response.url.rstrip('/')
         yield Request(f'{prefix}/b9{number}/',
                       callback=self.parse_house_link)
         number += 1
示例#17
0
 def parse_huxing_link(self, response):
     """Follow every floor-plan link found on the page.

     Args:
         response: the floor-plan index page; its ``house_id`` meta value
             is forwarded to each request.

     Yields:
         One ``Request`` per link, handled by ``parse_huxing``; nothing
         (after a warning) when no link is found.
     """
     room_links = find(response, shjd_xp.ROOM, False)
     if room_links:
         for link in room_links:
             meta = {'house_id': response.meta['house_id']}
             yield Request(link, callback=self.parse_huxing, meta=meta)
     else:
         self.logger.warning('room urls unreachable! %s', response.url)
示例#18
0
 def parse_room_count(self, response):
     """Request every floor-plan listing page based on the room count.

     The count read via ``ajk_xp.ROOM_COUNT`` is divided by 8 (rounded
     up) to get the number of pages; page URLs are built by replacing
     ``'.html'`` in the current URL with ``'/s?p=<page>'``.

     Args:
         response: the floor-plan overview page.

     Yields:
         One ``Request`` per page, handled by ``parse_room_url``.
     """
     # Walk every page according to the total number of floor plans.
     total = find(response, ajk_xp.ROOM_COUNT)
     if not total:
         self.logger.warning('room count is empty. %s', response.url)
         return
     try:
         room_count = int(total)
     except ValueError:
         self.logger.warning('room count is not a number: %r %s', total,
                             response.url)
         return
     # Ceiling division: the former ``n // 8 + 1`` requested one extra,
     # empty page whenever n was a multiple of 8.  At least one page is
     # still requested, as before.
     page_count = max(1, (room_count + 7) // 8)
     for page in range(1, page_count + 1):
         url = str(response.url).replace('.html', f'/s?p={page}')
         yield Request(url, callback=self.parse_room_url)
示例#19
0
    def parse_xiangqing(self, response):
        """Parse the detail page of a project into a house record.

        The city key is the text between the last ``'/'`` and the first
        ``'.'`` of the URL and is translated through ``CITY``.  Labels,
        base info, licenses and price history are optional sections: a
        missing one is logged as a warning and left out.

        Args:
            response: the detail page; ``response.meta['house_id']`` is
                read.

        Yields:
            A single house dict.
        """
        url_head = response.url.split('.')[0]
        city = url_head.split('/')[-1]
        house = {
            'new_data': True,
            'city': CITY[city],
            'table': self.name,
            'house_id': response.meta['house_id'],
            'alias_name': find(response, shjd_xp.OTHER_NAME),
        }

        labels = find(response, shjd_xp.LABELS, False)
        if labels:
            house['labels'] = labels
        else:
            self.logger.warning('empty labels! %s', response.url)

        rows = response.xpath(shjd_xp.INFO)
        if rows:
            for row in rows:
                # Each row is read twice: 'l' and 'r' class suffixes.
                for side in ('l', 'r'):
                    self.base(house, row, side)
        else:
            self.logger.warning('base info is empty! %s', response.url)

        license_rows = response.xpath(shjd_xp.LICENSE)
        if license_rows:
            licenses = []
            for row in license_rows:
                licenses.append({
                    'license_number': find(row, './td[1]/text()'),
                    'license_start_at': find(row, './td[2]/span/text()'),
                    'bind_building': find(row, './td[3]/text()'),
                })
            house['license'] = licenses
        else:
            self.logger.warning('license info is empty! %s', response.url)

        price_rows = response.xpath(shjd_xp.PRICE)
        if price_rows:
            # Only the first and the last cell of each row are collected;
            # highest / average / lowest price (td[2]..td[4]) used to be
            # commented out here and are still not stored.
            history = []
            for row in price_rows:
                history.append({
                    'release_time': find(row, './td[1]/span/text()'),
                    'price_details': find(row, './td[last()]/text()'),
                })
            house['price_history'] = history
            # The first history row provides the current price.
            house['price'] = history[0]['price_details']
        else:
            self.logger.warning('price unreachable! %s', response.url)

        house['description'] = find(response, shjd_xp.DESCRIPTION)

        yield house
示例#20
0
文件: qq.py 项目: broholens/GoodHouse
    def parse_news(self, response):
        """Build the news item from the ``div.bd`` story nodes.

        TODO: only the first page is fetched.

        Args:
            response: the news page; ``response.meta['house_id']`` is read.

        Yields:
            One dict with ``house_id``, ``table`` and
            ``item == ('news', [...])``; nothing (after a warning) when no
            story node is present.
        """
        stories = response.xpath('//div[@class="bd"]')
        if not stories:
            self.logger.warning('no story %s', response.url)
            return

        news = []
        for node in stories:
            published = find(node, './div/span/text()')
            target = find(node, './/h3/a/@href')
            headline = find(node, './/h3/a/text()')
            news.append({
                'update_at': published,
                'news_content': {
                    'news_link': target,
                    'news_title': headline,
                },
            })

        yield {
            'house_id': response.meta['house_id'],
            'table': self.name,
            'item': ('news', news),
        }
示例#21
0
    def parse(self, response):
        """Request every result page of the listing index.

        The number read via ``ajk_xp.TOTAL_PAGES`` is treated as a record
        count with 50 records per page; page URLs are the current URL plus
        ``/loupan/all/p<page>/``.

        Args:
            response: the site entry page.

        Yields:
            One ``Request`` per page, handled by ``parse_house_link``.
        """
        # Get the total number of pages and walk through them.
        pages = find(response, ajk_xp.TOTAL_PAGES)
        if not pages:
            self.logger.error('cannot find pages of %s', response.url)
            return
        try:
            record_count = int(pages)
        except ValueError:
            self.logger.error('page count is not a number: %r %s', pages,
                              response.url)
            return
        # 50 records per page.  Ceiling division: the former ``n // 50 + 1``
        # requested one extra page whenever n was a multiple of 50.  At
        # least one page is still requested, as before.
        page_count = max(1, (record_count + 49) // 50)

        base_url = response.url.rstrip('/')
        for page in range(1, page_count + 1):
            url = base_url + f'/loupan/all/p{page}/'
            yield Request(url, callback=self.parse_house_link)
示例#22
0
    def parse_pictorial(self, response):
        """Emit the entries of a pictorial page as a ``pictorial`` item.

        All nodes matched by ``ajk_xp.PIC_ITEMS`` are converted, then the
        last two entries are dropped.

        Args:
            response: the pictorial page; ``response.meta['house_id']`` is
                read.

        Yields:
            One dict with ``house_id``, ``table`` and
            ``item == ('pictorial', [...])``; nothing (after a warning)
            when no node matches.
        """
        # Pictorials are structured differently from ordinary pictures.
        nodes = response.xpath(ajk_xp.PIC_ITEMS)
        if not nodes:
            self.logger.warning('pictorial is empty %s', response.url)
            return

        house_id = response.meta['house_id']
        table = self.name
        entries = []
        for node in nodes:
            entries.append({
                'picture_url': find(node, './/img/@data-src'),
                'picture_title': find(node, './/h3/text()'),
                'picture_description': find(node, './/p/text()'),
            })
        del entries[-2:]

        yield {
            'house_id': house_id,
            'table': table,
            'item': ('pictorial', entries),
        }
示例#23
0
    def parse_pic(self, response):
        """Parse the album page: follow pictorial links and emit the album.

        When the header labels contain ``'画报'`` that label is removed and
        every pictorial URL (query string dropped) is requested.  The
        picture data itself is embedded in the page source: it is located
        with ``self.ptn_pic_loc``, rewritten into JSON and decoded.

        Args:
            response: the album page; ``response.meta['house_id']`` is
                read.

        Yields:
            ``Request`` objects handled by ``parse_pictorial``, then one
            dict with ``house_id``, ``table`` and
            ``item == ('album', [...])``.  The item is skipped, with an
            error log, when the embedded data is missing or is not valid
            JSON.
        """
        # Parse the picture parameters.
        labels = find(response, ajk_xp.PIC_HEADER, False)
        if '画报' in labels:
            # Pictorials are fetched separately; dropping the label lets
            # the remaining labels be zipped with the picture groups below.
            labels.remove('画报')
            urls = find(response, ajk_xp.PICTORIAL, False)
            for url in urls:
                yield Request(url.split('?')[0],
                              callback=self.parse_pictorial,
                              meta={'house_id': response.meta['house_id']})

        # Remove newlines and spaces so the pattern can match.
        html = ''.join(response.text.replace('\n', '').split(' '))
        data = self.ptn_pic_loc.findall(html)
        if not data:
            self.logger.error('picture info not found %s', response.url)
            return

        # Quote the bare keys and turn single quotes into double quotes so
        # the fragment becomes JSON.
        # NOTE(review): as rendered here, the ``replace(' ', ' ')`` call
        # looks like a no-op; its first argument may originally have been a
        # non-breaking space - verify against the repository.
        data = data[0].replace('big', '"big"').replace('small', '"small"')\
            .replace('image_id', '"image_id"').replace(' ', ' ')\
            .replace('image_des', '"image_des"').replace('\'', '"') + ']'

        try:
            data = json.loads(data)
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass; a bare
            # ``except`` would also hide unrelated failures.
            self.logger.error('json loads error %s', response.url)
            return

        yield {
            'house_id':
            response.meta['house_id'],
            'table':
            self.name,
            'item':
            ('album', [{
                'picture_label': label,
                'picture_url': url,
                'picture_description': des
            } for label, pic in zip(labels, data)
                       for url, des in zip(pic['big'], pic['image_des'])])
        }
示例#24
0
    def parse_house_link(self, response):
        """Schedule the detail, floor-plan, album and news pages of each listing.

        Listing links come from ``sf.HOUSE_LINK``; the house ids are the
        comma-separated first match of ``self.ptn_house_id`` in the page
        text.  Links and ids are paired positionally.  The city is the last
        dot-separated piece of the URL part before ``'.fang.'``.

        Args:
            response: a listing index page.

        Yields:
            Four ``Request`` objects per listing, in the order parameters,
            floor plans, pictures, news.
        """
        links = find(response, sf.HOUSE_LINK, False)
        if not links:
            self.logger.error('cannot find house_link of %s', response.url)
            return

        id_matches = self.ptn_house_id.findall(response.text)
        if not id_matches:
            self.logger.error('house ids not found! %s', response.url)
            return

        city = response.url.split('.fang.')[0].split('.')[-1]
        ids = id_matches[0].split(',')

        for raw_link, house_id in zip(links, ids):
            # Keep only the part of the link before any '/?' query.
            base = raw_link.split('/?')[0]

            # Basic parameters
            detail_url = base + f'house/{house_id}/housedetail.htm'
            yield Request(detail_url,
                          callback=self.parse_house,
                          meta={'house_id': house_id, 'city': city})

            # Floor plans
            huxing_url = self.huxing_url.format(base, house_id)
            yield Request(huxing_url,
                          callback=self.parse_room,
                          meta={'house_id': house_id})

            # Pictures
            photo_url = base + f'photo/{house_id}.htm'
            yield Request(photo_url,
                          callback=self.parse_pic_link,
                          meta={'house_id': house_id})

            # News
            news_url = base + f'house/{house_id}/dongtai.htm'
            yield Request(news_url,
                          callback=self.parse_news,
                          meta={'house_id': house_id, 'house_link': base})
示例#25
0
    def parse_house_link(self, response):
        """Schedule the detail, floor-plan, album and news pages of each house.

        The URL templates are read from ``response.meta`` under the keys
        ``xq``, ``hx``, ``xc`` and ``dt`` and formatted with the house id.

        Args:
            response: a listing index page carrying the four templates in
                its ``meta``.

        Yields:
            Four ``Request`` objects per house id, in the order details,
            floor plans, album, news.
        """
        house_ids = find(response, shjd_xp.HOUSE_IDS, False)
        if not house_ids:
            self.logger.error('house ids not clear! %s', response.url)
            return

        # (meta key of the URL template, callback) in request order:
        # details, floor plans, album, news.
        routes = (
            ('xq', self.parse_xiangqing),
            ('hx', self.parse_huxing_link),
            ('xc', self.parse_xiangce_link),
            ('dt', self.parse_news),
        )
        for house_id in house_ids:
            for key, callback in routes:
                yield Request(url=response.meta[key].format(house_id),
                              callback=callback,
                              meta={'house_id': house_id})
示例#26
0
    def parse_house(self, response):
        """Parse the detail page of a project into a house record.

        Rows matched by ``sf.INFO`` are translated through
        ``self.kw_dict``.  A string entry names the target field and the
        value is read from ``./div[2]/text()``; any other entry is read as
        ``(field, xpath)``.  When history tables are present, the first one
        holds licenses and the second one prices if there is more than one
        table; a single table is read as prices.

        Args:
            response: the detail page; ``house_id`` and ``city`` are read
                from ``response.meta`` (the city is translated via
                ``CITY``).

        Yields:
            A single house dict.
        """
        house = {
            # if false then push the data to array
            'new_data': True,
            'house_id': response.meta['house_id'],
            'city': CITY[response.meta['city']],
            'table': self.name,
            'building_name': find(response, sf.NAME),
            'alias_name': find(response, sf.ALIAS),
            'feature': find(response, sf.LABELS, False),
            'price': find(response, sf.PRICE),
            'description': find(response, sf.DESCRIPTION),
        }

        # Parameters
        ignored = ('项目特色', '楼盘特色', '预售许可证', '咨询电话')
        for row in response.xpath(sf.INFO):
            label = find(row, './div[1]/text() | ./span/text()')
            if not label:
                continue
            label = label.strip(':')
            if label in ignored:
                continue
            if label not in self.kw_dict:
                self.logger.warning('name %s unknown %s', label, response.url)
                continue
            target = self.kw_dict[label]
            if isinstance(target, str):
                field, value_xpath = target, './div[2]/text()'
            else:
                field, value_xpath = target[0], target[1]
            house[field] = find(row, value_xpath)

        tables = response.xpath(sf.HISTORY)
        if not tables:
            self.logger.warning('history unreachable! %s', response.url)
            yield house
            return

        rows_xpath = './/tr[position()>1]'
        if len(tables) > 1:
            license_list = []
            for row in tables[0].xpath(rows_xpath):
                license_list.append({
                    'license_number': find(row, './td[1]/text()'),
                    'license_start_at': find(row, './td[2]/text()'),
                    'bind_building': find(row, './td[3]/text()'),
                })
            house['license'] = license_list
            price_rows = tables[1].xpath(rows_xpath)
        else:
            price_rows = tables[0].xpath(rows_xpath)

        price_history = []
        for row in price_rows:
            price_history.append({
                'release_time': find(row, './td[1]/text()'),
                'price_details': find(row, './td[last()]/text()'),
            })
        house['price_history'] = price_history

        yield house