Python get_parsed_string示例，tripadvisorcom_scraper.spiders.crawlerhelper.get_parsed_string Python示例

示例#1

0

显示文件

文件： hotelscrawler.py 项目： arunpn/tripadvisorcom_scraper

    def parse(self, response):
        """Crawl a TripAdvisor city listing page or hotel detail page.

        Yields a ``Request`` (routed back to this callback) for every hotel
        link on the page and for the bottom "next page" link. When the
        response URL contains ``/Hotel_Review``, a ``HotelItem`` filled from
        the page is yielded as well.

        Any field that cannot be located or parsed is stored as ``None``
        instead of aborting the whole page.
        """
        url_start = 'http://www.tripadvisor.com'
        hxs = HtmlXPathSelector(response)

        # City listing pages expose their hotels as "property_title" links.
        # A page without such links simply produces no hotel requests.
        hotel_urls = hxs.select('//a[contains(@class, "property_title")]/@href').extract()
        for hotel_url in hotel_urls:
            yield Request(url_start + hotel_url, self.parse)

        # The next button exists at both the top and the bottom of the page;
        # only the bottom pager is followed so each page is requested once.
        # NOTE(review): the helpers presumably return a falsy value when the
        # XPath matches nothing -- confirm against crawlerhelper.
        next_page_url = clean_parsed_string(get_parsed_string(
            hxs, '//div[contains(@id, "pager_bottom")]//a[contains(@class, "guiArw sprite-pageNext  pid0")]/@href'))
        if next_page_url:
            yield Request(url_start + next_page_url, self.parse)

        # If the page itself is a hotel page, extract the details and
        # yield the hotel item.
        if '/Hotel_Review' in response.url:

            hi = HotelItem()

            hi['item_type'] = 'hotel'

            # re.search returns None when nothing matches; guard it so an
            # unexpected URL shape does not raise AttributeError.
            id_match = re.search('d[0-9]+', response.url)
            hi['hotel_id'] = id_match.group(0) if id_match else None

            hi['name'] = clean_parsed_string(get_parsed_string(
                hxs, '//h1[contains(@id, "HEADING")]/text()'))
            hi['locality'] = clean_parsed_string(get_parsed_string(
                hxs, '//div[contains(@class, "wrap infoBox")]//span[contains(@property, "v:locality")]/text()'))
            hi['region'] = clean_parsed_string(get_parsed_string(
                hxs, '//div[contains(@class, "wrap infoBox")]//span[contains(@property, "v:region")]/text()'))
            hi['postal_code'] = clean_parsed_string(get_parsed_string(
                hxs, '//div[contains(@class, "wrap infoBox")]//span[contains(@property, "v:postal-code")]/text()'))
            hi['country'] = clean_parsed_string(get_parsed_string(
                hxs, '//div[contains(@class, "wrap infoBox")]//span[contains(@property, "v:country-name")]/text()'))
            rating_string = clean_parsed_string(get_parsed_string(
                hxs, '//div[contains(@rel, "v:rating")]//img[contains(@class, "sprite-ratings")]/@alt'))
            review_count = clean_parsed_string(get_parsed_string(
                hxs, '//div[contains(@class, "rs rating")]//span[contains(@property, "v:count")]/text()'))
            price_range = clean_parsed_string(get_parsed_string(
                hxs, '//span[contains(@property, "v:pricerange")]/text()'))

            # The price range is stored as the length of the scraped text.
            hi['price_range'] = len(price_range) if price_range else None

            # Some review counts are written '1 review' instead of just
            # '1123', so keep only the leading token. Thousands separators
            # ('1,123') are stripped because int() rejects them.
            hi['review_count'] = (
                int(review_count.split()[0].replace(',', '')) if review_count else None)

            # Take the first number in the alt text. The dot is escaped and
            # the fractional part optional, so both '4.5 ...' and '4 ...'
            # parse; text without any number yields None.
            rating_match = (
                re.search(r'[0-9]+(?:\.[0-9]+)?', rating_string) if rating_string else None)
            hi['rating'] = float(rating_match.group(0)) if rating_match else None
            hi['url'] = response.url

            print(hi['name'])

            yield hi

示例#2

0

显示文件

    def parse(self, response):
        """Crawl a TripAdvisor location page down to individual cities.

        If the page lists sub-locations, a ``Request`` is yielded for each
        of them and for the "next page" link (when present), all routed back
        to this callback. Otherwise the page is treated as a city page and a
        single ``CityItem`` is yielded.

        ``city`` and ``geo_id`` are ``None`` when they cannot be extracted.
        """
        url_start = 'http://www.tripadvisor.com'
        hxs = HtmlXPathSelector(response)

        # The locations may or may not contain sub-locations.
        sub_urls = hxs.select('//div[contains(@class, "rolluptopdestlink")]/a/@href').extract()

        if sub_urls:
            # Descend into every sub-location...
            for sub_url in sub_urls:
                yield Request(url_start + sub_url, self.parse)

            # ...and follow the pager if there is one.
            # NOTE(review): the helpers presumably return a falsy value when
            # the XPath matches nothing -- confirm against crawlerhelper.
            next_page_url = clean_parsed_string(get_parsed_string(
                hxs, '//a[contains(@class, "guiArw sprite-pageNext")]/@href'))
            if next_page_url:
                yield Request(url_start + next_page_url, self.parse)

        # If no sub-locations are present, return the CityItem.
        else:
            url = response.url
            city = clean_parsed_string(get_parsed_string(
                hxs, '//h1[contains(@class, "header")]/text()'))

            # Keep only the heading text that precedes the word 'Hotels'.
            city = city.split('Hotels')[0].strip() if city else None

            # re.search returns None when nothing matches; guard it so a URL
            # without a 'g<digits>' token does not raise AttributeError.
            geo_match = re.search('g[0-9]+', url)
            geo_id = geo_match.group(0) if geo_match else None

            print(city)

            ci = CityItem()
            ci['item_type'] = 'city'
            ci['city'] = city
            ci['geo_id'] = geo_id
            ci['url'] = url

            yield ci

示例#3

0

显示文件

文件： reviewscrawler.py 项目： arunpn/tripadvisorcom_scraper

    def parse(self, response):
        """Crawl TripAdvisor review pages and yield one item per review.

        The default hotel page only shows shortened reviews. When such a
        "basic" review is found, a ``Request`` for its link is yielded so
        the full-text review page gets parsed instead. Otherwise every
        "review extended" block on the page is turned into a ``ReviewItem``.
        In both cases the "next page" link is followed when present.

        Fields that cannot be located or parsed are stored as ``None``
        rather than aborting the rest of the page.
        """
        url_start = 'http://www.tripadvisor.com'
        hxs = HtmlXPathSelector(response)

        # Reviews on the default page are shrunk behind a 'more' link;
        # following one of the review links leads to the complete content.
        review_url = clean_parsed_string(get_parsed_string(
            hxs, '//div[contains(@class, "basic_review first")]//a/@href'))

        if review_url:
            yield Request(url_start + review_url, self.parse)

        # If the page is not a basic review page, we can proceed with
        # parsing the reviews.
        else:
            # The hotel id depends only on the URL, so compute it once.
            # re.search returns None on no match; guard against that.
            id_match = re.search('d[0-9]+', response.url)
            hotel_id = id_match.group(0) if id_match else None

            raw_reviews = hxs.select('//div[contains(@class, "review extended")]')
            for raw_review in raw_reviews:
                ri = ReviewItem()
                ri['item_type'] = 'review'
                ri['hotel_id'] = hotel_id
                ri['review_id'] = clean_parsed_string(get_parsed_review_element(
                    raw_review, '@id'))

                rdate_text = clean_parsed_string(get_parsed_review_element(
                    raw_review, 'div//span[contains(@class, "ratingDate")]/text()'))
                ri['review_date'] = None
                if rdate_text:
                    # Drop the 'Reviewed' prefix when present; without it
                    # the whole text is tried as the date.
                    _, marker, tail = rdate_text.partition('Reviewed')
                    date_text = (tail if marker else rdate_text).strip()
                    try:
                        ri['review_date'] = time.strftime(
                            '%Y-%m-%d', time.strptime(date_text, '%B %d, %Y'))
                    except ValueError:
                        # Not an absolute 'Month day, Year' date; leave the
                        # field empty instead of failing the whole page.
                        ri['review_date'] = None

                ri['reviewer_type'] = None # TODO: Try to find the info and insert here
                ri['summary'] = clean_parsed_string(get_parsed_review_element(
                    raw_review, 'div//div[contains(@class, "quote")]/text()'))
                ri['reviewer_name'] = clean_parsed_string(get_parsed_review_element(
                    raw_review, 'div//div[contains(@class, "username mo")]/span/text()'))

                reviewer_rcount = clean_parsed_string(get_parsed_review_element(
                    raw_review, 'div//div[contains(@class, "totalReviewBadge")]//span[contains(@class, "badgeText")]/text()'))
                # Keep the leading token and strip thousands separators,
                # which int() would otherwise reject ('1,234 reviews').
                ri['reviewer_rcount'] = (
                    int(reviewer_rcount.split()[0].replace(',', '')) if reviewer_rcount else None)

                reviewer_locality = clean_parsed_string(get_parsed_review_element(
                    raw_review, 'div//div[contains(@class, "member_info")]//div[contains(@class, "location")]/text()'))
                ri['reviewer_locality'] = reviewer_locality.title() if reviewer_locality else None
                ri['content'] = clean_parsed_string(get_parsed_review_element(
                    raw_review, 'div//div[contains(@class, "entry")]//p'))

                rating_text = clean_parsed_string(get_parsed_review_element(
                    raw_review, 'div//div[contains(@class, "rating reviewItemInline")]//img/@alt'))
                # The rating is the leading token of the image alt text.
                ri['rating'] = int(rating_text.split()[0]) if rating_text else None
                ri['recommendations'] = raw_review.select('div//li[contains(@class, "recommend-answer")]').extract()

                print('%s:%s:%s' % (ri['review_id'], ri['reviewer_name'], ri['review_date']))

                yield ri

        # Find the next page link if available and yield it.
        next_page_url = clean_parsed_string(get_parsed_string(
            hxs, '//a[contains(@class, "guiArw sprite-pageNext")]/@href'))
        if next_page_url:
            yield Request(url_start + next_page_url, self.parse)

示例#4

0

显示文件

    def parse(self, response):
        """Crawl TripAdvisor review pages and yield one item per review.

        When the page only carries a shortened ("basic") review, a
        ``Request`` for that review's link is yielded so the full-text page
        is parsed instead. Otherwise each "review extended" block becomes a
        ``ReviewItem``. The "next page" link is followed in either case.

        Fields that cannot be located or parsed are stored as ``None``
        rather than aborting the rest of the page.
        """
        url_start = 'http://www.tripadvisor.com'
        hxs = HtmlXPathSelector(response)

        # The default hotels page contains the reviews, but they are shrunk
        # and need a click on 'more' to show the complete content. Following
        # one of the review links reaches the full text instead.
        review_url = clean_parsed_string(
            get_parsed_string(
                hxs, '//div[contains(@class, "basic_review first")]//a/@href'))

        if review_url:
            yield Request(url_start + review_url, self.parse)

        # If the page is not a basic review page, we can proceed with
        # parsing the reviews.
        else:
            # The hotel id only depends on the URL: resolve it once, and
            # tolerate a URL without a 'd<digits>' token (re.search -> None).
            id_match = re.search('d[0-9]+', response.url)
            hotel_id = id_match.group(0) if id_match else None

            raw_reviews = hxs.select(
                '//div[contains(@class, "review extended")]')
            for raw_review in raw_reviews:
                ri = ReviewItem()
                ri['item_type'] = 'review'
                ri['hotel_id'] = hotel_id
                ri['review_id'] = clean_parsed_string(
                    get_parsed_review_element(raw_review, '@id'))

                rdate_text = clean_parsed_string(
                    get_parsed_review_element(
                        raw_review,
                        'div//span[contains(@class, "ratingDate")]/text()'))
                ri['review_date'] = None
                if rdate_text:
                    # Strip the 'Reviewed' prefix if it is there; otherwise
                    # try the whole text as the date.
                    _, marker, tail = rdate_text.partition('Reviewed')
                    date_text = (tail if marker else rdate_text).strip()
                    try:
                        rdate = time.strptime(date_text, '%B %d, %Y')
                        ri['review_date'] = time.strftime('%Y-%m-%d', rdate)
                    except ValueError:
                        # Date is not in 'Month day, Year' form; keep the
                        # field empty instead of failing the whole page.
                        ri['review_date'] = None

                ri['reviewer_type'] = None  # TODO: Try to find the info and insert here
                ri['summary'] = clean_parsed_string(
                    get_parsed_review_element(
                        raw_review,
                        'div//div[contains(@class, "quote")]/text()'))
                ri['reviewer_name'] = clean_parsed_string(
                    get_parsed_review_element(
                        raw_review,
                        'div//div[contains(@class, "username mo")]/span/text()'
                    ))

                reviewer_rcount = clean_parsed_string(
                    get_parsed_review_element(
                        raw_review,
                        'div//div[contains(@class, "totalReviewBadge")]//span[contains(@class, "badgeText")]/text()'
                    ))
                # Leading token only, with thousands separators removed
                # because int() cannot parse '1,234'.
                ri['reviewer_rcount'] = int(
                    reviewer_rcount.split()[0].replace(
                        ',', '')) if reviewer_rcount else None

                reviewer_locality = clean_parsed_string(
                    get_parsed_review_element(
                        raw_review,
                        'div//div[contains(@class, "member_info")]//div[contains(@class, "location")]/text()'
                    ))
                ri['reviewer_locality'] = reviewer_locality.title(
                ) if reviewer_locality else None
                ri['content'] = clean_parsed_string(
                    get_parsed_review_element(
                        raw_review, 'div//div[contains(@class, "entry")]//p'))

                rating_text = clean_parsed_string(
                    get_parsed_review_element(
                        raw_review,
                        'div//div[contains(@class, "rating reviewItemInline")]//img/@alt'
                    ))
                # The rating is the leading token of the image alt text.
                ri['rating'] = int(
                    rating_text.split()[0]) if rating_text else None
                ri['recommendations'] = raw_review.select(
                    'div//li[contains(@class, "recommend-answer")]').extract()

                print('%s:%s:%s' % (ri['review_id'], ri['reviewer_name'],
                                    ri['review_date']))

                yield ri

        # Find the next page link if available and yield it.
        next_page_url = clean_parsed_string(
            get_parsed_string(
                hxs, '//a[contains(@class, "guiArw sprite-pageNext")]/@href'))
        if next_page_url:
            yield Request(url_start + next_page_url, self.parse)

示例#5

0

显示文件

    def parse(self, response):
        """Crawl a TripAdvisor city listing page or hotel detail page.

        Every hotel link on the page and the bottom "next page" link are
        yielded as ``Request`` objects routed back to this callback. If the
        response URL contains ``/Hotel_Review``, a ``HotelItem`` built from
        the page is yielded too.

        Any field that cannot be located or parsed is stored as ``None``
        instead of aborting the whole page.
        """
        url_start = 'http://www.tripadvisor.com'
        hxs = HtmlXPathSelector(response)

        # Parse the page for hotels and yield them if the page is a city
        # page. An empty result simply yields no hotel requests.
        hotel_urls = hxs.select(
            '//a[contains(@class, "property_title")]/@href').extract()
        for hotel_url in hotel_urls:
            yield Request(url_start + hotel_url, self.parse)

        # The next button is available both on the top and the bottom of
        # the page; only the bottom one is followed so that each page is
        # requested once.
        # NOTE(review): the helpers presumably return a falsy value when the
        # XPath matches nothing -- confirm against crawlerhelper.
        next_page_url = clean_parsed_string(
            get_parsed_string(
                hxs,
                '//div[contains(@id, "pager_bottom")]//a[contains(@class, "guiArw sprite-pageNext  pid0")]/@href'
            ))
        if next_page_url:
            yield Request(url_start + next_page_url, self.parse)

        # If the page itself is a hotels page, get the details and
        # return the hotel item.
        if '/Hotel_Review' in response.url:

            hi = HotelItem()

            hi['item_type'] = 'hotel'

            # Guard the match: re.search returns None when the URL has no
            # 'd<digits>' token, and calling .group() on None would raise.
            id_match = re.search('d[0-9]+', response.url)
            hi['hotel_id'] = id_match.group(0) if id_match else None

            hi['name'] = clean_parsed_string(
                get_parsed_string(hxs,
                                  '//h1[contains(@id, "HEADING")]/text()'))
            hi['locality'] = clean_parsed_string(
                get_parsed_string(
                    hxs,
                    '//div[contains(@class, "wrap infoBox")]//span[contains(@property, "v:locality")]/text()'
                ))
            hi['region'] = clean_parsed_string(
                get_parsed_string(
                    hxs,
                    '//div[contains(@class, "wrap infoBox")]//span[contains(@property, "v:region")]/text()'
                ))
            hi['postal_code'] = clean_parsed_string(
                get_parsed_string(
                    hxs,
                    '//div[contains(@class, "wrap infoBox")]//span[contains(@property, "v:postal-code")]/text()'
                ))
            hi['country'] = clean_parsed_string(
                get_parsed_string(
                    hxs,
                    '//div[contains(@class, "wrap infoBox")]//span[contains(@property, "v:country-name")]/text()'
                ))
            rating_string = clean_parsed_string(
                get_parsed_string(
                    hxs,
                    '//div[contains(@rel, "v:rating")]//img[contains(@class, "sprite-ratings")]/@alt'
                ))
            review_count = clean_parsed_string(
                get_parsed_string(
                    hxs,
                    '//div[contains(@class, "rs rating")]//span[contains(@property, "v:count")]/text()'
                ))
            price_range = clean_parsed_string(
                get_parsed_string(
                    hxs, '//span[contains(@property, "v:pricerange")]/text()'))

            # The price range is stored as the length of the scraped text.
            hi['price_range'] = len(price_range) if price_range else None

            # Some review counts are written '1 review' instead of just
            # '1123', so split off the numerical part. Thousands separators
            # ('1,123') are removed because int() rejects them.
            hi['review_count'] = int(
                review_count.split()[0].replace(
                    ',', '')) if review_count else None

            # First number in the alt text; the dot is escaped and the
            # fraction optional so '4.5 ...' and '4 ...' both parse, and
            # text without a number yields None instead of raising.
            rating_match = re.search(
                r'[0-9]+(?:\.[0-9]+)?',
                rating_string) if rating_string else None
            hi['rating'] = float(
                rating_match.group(0)) if rating_match else None
            hi['url'] = response.url

            print(hi['name'])

            yield hi