Example #1
    def parse(self, response):
        meta = response.meta
        item = meta['item']
        item['house_layout'] = self.parse_layout(response)

        images_url = meta['root_url'] + 'xiangce/'
        yield from ParseUtil.start_request(images_url, ImagesParser().parse, meta)
Example #2
    def start_requests(self):
        item = HousemarkettrackerItem()
        # item['house_name'] = building.xpath('@title').extract_first()
        # item['home_page'] = root_url
        meta = {}
        meta['item'] = item
        meta['root_url'] = 'https://cd.fang.lianjia.com/loupan/p_sdwdgcabfld/'
        root_url = self.start_urls[0]
        yield from ParseUtil.start_request(root_url, self.parse, meta)
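Every example here hands request construction to ParseUtil.start_request or ParseUtil.start_request_with_lua, which are not included in the snippets. Below is a minimal sketch of what such helpers could look like, assuming start_request simply wraps a plain scrapy.Request carrying the shared meta dict and start_request_with_lua goes through scrapy-splash so JavaScript-rendered pages can be fetched; the method names and the yield-from usage come from the examples, while everything inside the bodies (including the Lua script) is an assumption.

import scrapy
from scrapy_splash import SplashRequest


class ParseUtil:
    # Hypothetical Lua script; the real one is not shown in these examples.
    LUA_RENDER = """
    function main(splash, args)
        splash:go(args.url)
        splash:wait(1.0)
        return splash:html()
    end
    """

    @staticmethod
    def start_request(url, callback, meta):
        # a generator, so callers can simply `yield from` it
        yield scrapy.Request(url, callback=callback, meta=meta, dont_filter=True)

    @staticmethod
    def start_request_with_lua(url, callback, meta):
        # same idea, but rendered through Splash so dynamic content is present
        yield SplashRequest(url, callback=callback, meta=meta,
                            endpoint='execute',
                            args={'lua_source': ParseUtil.LUA_RENDER})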
Example #3
    def parse(self, response):
        buildings = response.xpath('/html/body/div[4]/ul[2]/li/a')

        for building in buildings:
            root_url = "https://cd.fang.lianjia.com" + building.xpath(
                '@href').extract_first()

            item = HousemarkettrackerItem()
            item['house_name'] = building.xpath('@title').extract_first()
            item['home_page'] = root_url

            meta = {}
            meta['item'] = item
            meta['root_url'] = root_url

            detail_url = root_url + 'xiangqing/'
            print('start: ------' + root_url)
            yield from ParseUtil.start_request(detail_url,
                                               DetailParser().parse, meta)
Example #4
    def parse(self, response):
        meta = response.meta
        item = meta['item']

        basic_dict = self.parse_basic_info(response)
        plan_dict = self.parse_planning_info(response)
        facility_dict = self.parse_ancillary_facility(response)
        pre_sales_list = self.parse_pre_sales(response)
        opening_info_list = self.parse_sales_info(response)

        item['house_detail'] = HouseDetail(basic_dict, opening_info_list,
                                           plan_dict, pre_sales_list,
                                           facility_dict).__dict__
        meta['item'] = item
        home_page_url = meta['root_url']
        yield from ParseUtil.start_request_with_lua(
            home_page_url,
            HouseHomePageParser().parse, meta)
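The parser above packs five dictionaries/lists into a HouseDetail object and immediately stores its __dict__, which only works if HouseDetail is a plain data holder. A minimal sketch of such a class, with attribute names invented here for illustration (only the constructor argument order is taken from the call above):

class HouseDetail:
    """Plain container; __dict__ yields a serialisable dict of its fields."""

    def __init__(self, basic_dict, opening_info_list, plan_dict,
                 pre_sales_list, facility_dict):
        # attribute names are hypothetical; the argument order matches the parser
        self.basic_info = basic_dict
        self.opening_info = opening_info_list
        self.planning_info = plan_dict
        self.pre_sales = pre_sales_list
        self.ancillary_facility = facility_dict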
Example #5
    def parse(self, res):
        news_list = []
        news_div_s = res.xpath('//div[@class="dongtai-one for-dtpic"]')
        for new_div in news_div_s:
            news_dict = {}
            news_dict['tag'] = new_div.xpath(
                'a/span[@class="a-tag"]/text()').extract_first()
            news_dict['title'] = new_div.xpath(
                'a/span[@class="a-title"]/text()').extract_first()
            news_dict['time'] = new_div.xpath(
                'a/span[@class="a-time"]/text()').extract_first()
            news_dict['content'] = new_div.xpath(
                'child::*//div[@class="a-word"]/a/text()').extract_first()
            news_dict['link'] = new_div.xpath(
                'child::*//div[@class="a-word"]/a/@href').extract_first()

            news_list.append(news_dict)

        meta = res.meta
        item = meta['item']

        if item.get('house_news') is None:
            item['house_news'] = news_list
        else:
            item['house_news'] += news_list

        page = res.xpath('//div[@class="page-box"]')
        current_page_str = page.xpath('@data-current').extract_first()
        if current_page_str is None:
            yield item
        else:
            current_page_index = int(current_page_str)
            total_count = int(page.xpath('@data-total-count').extract_first())
            total_pages = ceil(total_count / 20.0)
            if current_page_index < total_pages:
                next_page_url = meta['root_url'] + 'dongtai/pg' + str(
                    current_page_index + 1)
                yield from ParseUtil.start_request(next_page_url,
                                                   NewsParser().parse, meta)
            else:
                yield item
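This news parser and the comment parser in Example #7 repeat the same pagination pattern: read data-current and data-total-count from the page-box div, assume 20 entries per page, and follow pg<n+1> until the last page. A small helper along these lines could hold that logic in one place; the function and its name are not part of the original code:

from math import ceil


def next_listing_page(response, base_url, path, page_size=20):
    """Return the URL of the next listing page, or None on the last page."""
    page = response.xpath('//div[@class="page-box"]')
    current = page.xpath('@data-current').extract_first()
    if current is None:
        return None
    total_count = int(page.xpath('@data-total-count').extract_first())
    total_pages = ceil(total_count / page_size)
    current_page = int(current)
    if current_page >= total_pages:
        return None
    # e.g. base_url + 'dongtai/pg2' or base_url + 'pinglun/pg3'
    return base_url + path + 'pg' + str(current_page + 1)

With that in place, the tail of the method above reduces to calling next_listing_page(res, meta['root_url'], 'dongtai/') and yielding the item once it returns None.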
Example #6
    def parse(self, response):
        meta = response.meta
        item = meta['item']

        all_album_divs = response.xpath('//div[@class="tab-group"]')
        image_dict = {}
        for album_div in all_album_divs:
            title = album_div.xpath('h4/a/text()').extract_first()
            # strip any trailing photo count so the album title can key the dict
            title = re.search(r'(.+?)\s*(\d*)$', title).group(1)
            image_li_s = album_div.xpath('ul/li')
            image_list = []
            for image_li in image_li_s:
                image_url = image_li.xpath('a/img/@src').extract_first()
                # swap the 235x178 thumbnail size for the full-size 1000x variant
                image_url = re.sub(r'235x178', '1000x', image_url)
                image_list.append(image_url)
            image_dict[title] = image_list
        item['house_images'] = image_dict

        comments_url = meta['root_url'] + 'pinglun/'
        yield from ParseUtil.start_request(comments_url,
                                           CommentParser().parse, meta)
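The re.sub call above assumes Lianjia album thumbnails encode their pixel size directly in the URL, so swapping 235x178 for 1000x points at the full-size image. A hypothetical URL illustrating the rewrite:

import re

thumb = 'https://image1.ljcdn.com/sample-album/photo.jpg.235x178.jpg'  # hypothetical URL
full = re.sub(r'235x178', '1000x', thumb)
# full == 'https://image1.ljcdn.com/sample-album/photo.jpg.1000x.jpg'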
Example #7
    def parse_comments(self, res):
        li_s = res.xpath('//li[@data-role="commentitem"]')
        comments_in_page = []
        for li in li_s:
            comment_content_dict = {}
            # user
            user = li.xpath('div[@class="l_userpic"]')
            # .// keeps the image lookup scoped to this comment's user block
            comment_content_dict['user_image'] = user.xpath(
                './/img/@src').extract_first()
            user_line = user.xpath('div[@class="info"]//text()').extract()
            comment_content_dict['user_name'] = self.normalize_space(
                user_line[0])
            if len(user_line) > 1:  # visitor don't have user life info
                comment_content_dict['user_life'] = self.normalize_space(
                    user_line[1])
            # comment
            comment = li.xpath('div[@class="r_comment"]')
            comment_content_dict['tag'] = comment.xpath(
                'span[@class="tag"]/text()').extract_first()
            star = comment.xpath(
                'child::*//div[@class="star_info"]/@style').extract_first()
            # the style carries a percentage width, e.g. "width:90%" -> 4.5 stars
            star = 5 * int(re.match(r'.+?(\d+)%', star).group(1)) / 100
            comment_content_dict['star'] = star
            all_item_score_s = comment.xpath(
                'child::*/div[@class="num"]/span/text()').extract()
            for specific_score in all_item_score_s:
                key_value = specific_score.split(':')
                comment_content_dict[key_value[0]] = key_value[1]
            comment_content_dict['words'] = li.xpath(
                'child::*//div[@class="words"]/text()').extract_first()
            comment_content_dict['time'] = li.xpath(
                'child::*//div[@class="time"]/text()').extract_first()
            comment_content_dict['like'] = li.xpath(
                'child::*//div[@class="like"]/span/text()').extract_first()

            comments_in_page.append(comment_content_dict)

        # meta
        meta = res.meta
        item = meta['item']
        item['house_comment']['comments'] += comments_in_page

        current_page_str = res.xpath(
            '//div[@class="page-box"]/@data-current').extract_first()
        if current_page_str is None:
            yield from self.start_parse_news(meta)
        else:
            current_page = int(current_page_str)
            total_pages = ceil(
                int(
                    res.xpath('//div[@class="page-box"]/@data-total-count').
                    extract_first()) / 20.0)
            next_page_url = meta['root_url'] + 'pinglun/pg' + str(
                current_page + 1)

            if current_page < total_pages:
                yield from ParseUtil.start_request(
                    next_page_url,
                    CommentParser().parse_comments, meta)
            else:
                yield from self.start_parse_news(meta)
Example #8
    def start_parse_news(self, meta):
        news_url = meta['root_url'] + 'dongtai/'
        yield from ParseUtil.start_request(news_url, NewsParser().parse, meta)
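All of the parsers write into fields of HousemarkettrackerItem, whose declaration is not included in these examples. A sketch of what that item class would need to look like, assuming each field referenced above is a plain scrapy.Field; only the field names are taken from the snippets:

import scrapy


class HousemarkettrackerItem(scrapy.Item):
    house_name = scrapy.Field()
    home_page = scrapy.Field()
    house_layout = scrapy.Field()
    house_detail = scrapy.Field()
    house_images = scrapy.Field()
    house_comment = scrapy.Field()
    house_news = scrapy.Field()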