예제 #1
0
파일: crawling.py 프로젝트: dyzsasd/winery
    def parse(self):
        """Build a ``Region`` record from the parsed region page.

        Reads ``self.url`` and queries ``self.tree`` with XPath for the page
        heading, the ``data-query`` attribute of the ``region-map`` element,
        and the links listed under ``div.location``. The first link is
        reported as the country and, when more than one link exists, the last
        one is reported as the parent.

        Returns:
            Region: populated with the URL, name, geo query, country and
            parent name/id, the '/'-joined link names and ids, and ``niveau``
            (the number of links, or -1 when there are none).
        """
        page_url = self.url
        tree = self.tree

        heading = ''.join(
            piece.strip() for piece in tree.xpath('//h1/text()'))
        map_query = ''.join(
            piece.strip()
            for piece in tree.xpath('//*[@id="region-map"]/@data-query'))

        link_names = tree.xpath('//div[@class="location"]//li//h3/a/text()')
        link_ids = [
            href2id(link)
            for link in tree.xpath('//div[@class="location"]//li//h3/a/@href')
        ]

        # First link -> country; empty or falsy entries collapse to ''.
        if link_names:
            top_name = link_names[0] or ''
        else:
            top_name = ''
        if link_ids:
            top_id = link_ids[0] or ''
        else:
            top_id = ''

        # A parent is only reported when there are at least two links.
        if len(link_names) > 1:
            last_name = link_names[-1] or ''
        else:
            last_name = ''
        if len(link_ids) > 1:
            last_id = link_ids[-1] or ''
        else:
            last_id = ''

        joined_names = '/'.join(link_names)
        joined_ids = '/'.join(link_ids)

        # -1 marks a page without any location links.
        depth = len(link_names)
        level = depth if depth > 0 else -1

        return Region(
            url=page_url,
            name=heading,
            geo_query=map_query,
            country_name=top_name,
            country_id=top_id,
            parent_name=last_name,
            parent_id=last_id,
            ancestor_region_names=joined_names,
            ancestor_region_ids=joined_ids,
            niveau=level
        )
예제 #2
0
파일: crawling.py 프로젝트: dyzsasd/winery
    def parse(self):
        """Build a ``Win`` record from the parsed wine page.

        Reads ``self.url`` and queries ``self.tree`` with XPath. Text fields
        that are not found fall back to ``''``; numeric fields (rating, rating
        count, price) fall back to ``-1``. A parsed value of zero is kept as
        ``0.0`` rather than being mistaken for "missing".

        Returns:
            Win: populated with the URL, name, year, country/region/winery
            name and id, rating value and count, price, food pairings,
            style name and id, and grape names and ids.
        """
        url = self.url
        tree = self.tree

        def first_text(query):
            """Return the first XPath match for *query*, or '' if none."""
            matches = tree.xpath(query)
            return matches[0] if matches else ''

        def linked_id(query):
            """Join all hrefs matched by *query* and convert them to an id."""
            return href2id(''.join(tree.xpath(query)))

        def first_number(query):
            """Return the first number found in the matched texts, or -1.

            Numbers are located with ``chiffre_regex``; a decimal comma is
            converted to a dot before the float conversion.
            """
            numbers = [
                number
                for text in tree.xpath(query)
                for number in chiffre_regex.findall(text)
            ]
            # Explicit emptiness check: ``x and float(...) or -1`` would turn
            # a legitimate 0 into the "missing" sentinel.
            return float(numbers[0].replace(',', '.')) if numbers else -1

        # All title spans but the last form the name; the last one is kept
        # as the year.
        name_list = tree.xpath('//h1[@itemprop="name"]/span/text()')
        name = ' '.join(name_list[:-1])
        # A title without spans previously raised IndexError here.
        year = name_list[-1] if name_list else ''

        country_name = first_text('//a[@data-item-type="Country"]/text()')
        country_id = linked_id('//a[@data-item-type="Country"]/@href')

        region_name = first_text('//a[@data-item-type="wine-region"]/text()')
        region_id = linked_id('//a[@data-item-type="wine-region"]/@href')

        winery_name = first_text('//a[@data-item-type="winery"]/text()')
        winery_id = linked_id('//a[@data-item-type="winery"]/@href')

        rating = first_number(
            '//*[@data-track-type="wi"]'
            '//*[@itemprop="aggregateRating"]'
            '//*[@itemprop="ratingValue"]/text()')
        # NOTE(review): the rating count is searched under itemprop="price"
        # while the rating value is read from "aggregateRating" -- possibly a
        # copy-paste slip; confirm against the page markup before changing.
        count = first_number(
            '//*[@data-track-type="wi"]'
            '//*[@itemprop="price"]'
            '//*[@itemprop="ratingCount"]/text()')
        price = first_number(
            '//*[@data-track-type="wi"]'
            '//*[@itemprop="offers"]'
            '/*[@itemprop="price"]/text()')

        foods_pairings = [
            food.strip().replace(',', '')
            for food in tree.xpath(
                '//div[@class="row wine-information-entry"]'
                '//*[@data-item-type="food-pairing"]/text()')
        ]

        region_style_name = first_text(
            '//a[@data-item-type="wine-style"]/text()')
        region_style_id = linked_id('//a[@data-item-type="wine-style"]/@href')

        grape_names = tree.xpath(
            '//div[@class="row wine-information-entry"]'
            '//a[@data-item-type="grape"]/text()')
        grape_ids = [
            href2id(href)
            for href in tree.xpath(
                '//div[@class="row wine-information-entry"]'
                '//a[@data-item-type="grape"]/@href')
        ]

        return Win(
            url=url,
            name=name,
            year=year,
            country_name=country_name,
            country_id=country_id,
            region_name=region_name,
            region_id=region_id,
            winery_name=winery_name,
            winery_id=winery_id,
            rating_value=rating,
            rating_count=count,
            price=price,
            foods_pairings=foods_pairings,
            region_style_name=region_style_name,
            region_style_id=region_style_id,
            grape_names=grape_names,
            grape_ids=grape_ids
        )
예제 #3
0
파일: crawling.py 프로젝트: dyzsasd/winery
    def parse(self):
        """Build a ``RegionStyle`` record from the parsed wine-style page.

        Reads ``self.url`` and queries ``self.tree`` with XPath. Text fields
        that are not found fall back to ``''``; ``acidity`` and ``body`` fall
        back to ``-1.0`` when their attribute is absent or not a number.

        Returns:
            RegionStyle: populated with the URL, name, country/region name
            and id, food pairings, grape names and ids, body, acidity and
            description.
        """
        url = self.url
        tree = self.tree

        def first_text(query):
            """Return the first XPath match for *query*, or '' if none."""
            matches = tree.xpath(query)
            return matches[0] if matches else ''

        def scale_value(query):
            """Return the attribute matched by *query* as a float, or -1.0."""
            try:
                # xpath() returns a list, so the matches must be joined into
                # one string before float() can parse them.
                return float(''.join(tree.xpath(query)))
            except (TypeError, ValueError):
                return -1.0

        name_list = tree.xpath(
            '//div[@class="wine-style-main-content"]/h1/span//text()')
        name = ' '.join(name_list)

        country_name = first_text(
            '//div[@class="row wine-style-area"]/div[h5/text()="Country"]//a/text()'
        )
        country_id = href2id(''.join(
            tree.xpath(
                '//div[@class="row wine-style-area"]/div[h5/text()="Country"]//a/@href'
            )
        ))
        region_name = first_text(
            '//div[@class="row wine-style-area"]/div[h5/text()="Region"]//a/text()'
        )
        region_id = href2id(''.join(
            tree.xpath(
                '//div[@class="row wine-style-area"]/div[h5/text()="Region"]//a/@href'
            )
        ))

        # Skip text nodes that contain no word characters at all.
        foods_pairings = [
            food.strip().replace(',', '') for food in tree.xpath(
                '//div[@class="wine-information-item" and h3/text()="Food Pairing"]'
                '//li/text()'
            ) if re.findall(r'\w+', food)
        ]

        grape_names = tree.xpath(
            '//div[@class="wine-information-item" and h3/text()="Grapes"]'
            '//a/text()'
        )
        grape_ids = [href2id(href) for href in tree.xpath(
            '//div[@class="wine-information-item" and h3/text()="Grapes"]'
            '//a/@href')]

        acidity = scale_value(
            '//div[@class="wine-information-item" and h3/text()="Acidity"]'
            '//figure/@data-grape-acidity')

        # Previously float() was applied to the raw xpath list, which always
        # raised TypeError and left body at -1.0 for every page.
        body = scale_value(
            '//div[@class="wine-information-item" and h3/text()="Body"]'
            '//figure/@data-grape-body')

        description = ''.join(tree.xpath(
            '//div[@class="wine-style-description row visible-xs"]//text()'))

        return RegionStyle(
            url=url,
            name=name,
            country_name=country_name,
            country_id=country_id,
            region_name=region_name,
            region_id=region_id,
            food_pairings=foods_pairings,
            grape_names=grape_names,
            grape_ids=grape_ids,
            body=body,
            acidity=acidity,
            description=description
        )
예제 #4
0
파일: crawling.py 프로젝트: dyzsasd/winery
    def parse(self):
        """Build a ``Winery`` record from the parsed winery page.

        Reads ``self.url`` and queries ``self.tree`` with XPath. Text fields
        that are not found fall back to ``''``; the rating value and rating
        count fall back to ``-1``. A parsed value of zero is kept as ``0.0``
        rather than being mistaken for "missing". ``description`` is always
        the empty string.

        Returns:
            Winery: populated with the URL, name, country/region name and id,
            rating value and count, address, websites, winemaker and
            description.
        """
        url = self.url
        tree = self.tree

        def first_text(query):
            """Return the first XPath match for *query*, or '' if none."""
            matches = tree.xpath(query)
            return matches[0] if matches else ''

        def first_number(query):
            """Return the first number found in the matched strings, or -1.

            Numbers are located with ``chiffre_regex``; a decimal comma is
            converted to a dot before the float conversion.
            """
            numbers = [
                number
                for text in tree.xpath(query)
                for number in chiffre_regex.findall(text)
            ]
            # Explicit emptiness check: ``x and float(...) or -1`` would turn
            # a legitimate 0 into the "missing" sentinel.
            return float(numbers[0].replace(',', '.')) if numbers else -1

        name = ''.join(
            fragment.strip() for fragment in tree.xpath('//h1/text()'))

        country_name = first_text(
            '//span[meta/@itemprop="addressCountry"]/a/text()'
        )
        country_id = href2id(''.join(
            tree.xpath(
                '//span[meta/@itemprop="addressCountry"]/a/@href'
            )
        ))
        region_name = first_text(
            '//section[@class="main-content-section winery-information"]'
            '//span[@itemprop="addressRegion"]/a/text()'
        )
        region_id = href2id(''.join(
            tree.xpath(
                '//section[@class="main-content-section winery-information"]'
                '//span[@itemprop="addressRegion"]/a/@href'
            )
        ))

        rating = first_number('//*[@itemprop="ratingValue"]/text()')
        count = first_number('//*[@itemprop="ratingCount"]/@content')

        # NOTE(review): absolute, position-based path -- it depends on the
        # exact page layout, unlike the attribute-based queries around it.
        websites = tree.xpath(
            '/html/body/div[2]/section[1]/div'
            '/div[3]/div[1]/section[2]/div/div/a/@href')

        # Street, postal code and locality, one per line in document order.
        address = '\n'.join(tree.xpath(
            '//*[@itemprop="streetAddress" '
            'or @itemprop="postalCode" or '
            '@itemprop="addressLocality"]/text()'))

        win_maker = ''.join(tree.xpath('//*[@class="semi winemaker"]/text()'))

        # No description is extracted from the page.
        description = ""

        return Winery(
            url=url,
            name=name,
            country_name=country_name,
            country_id=country_id,
            region_name=region_name,
            region_id=region_id,
            rating_value=rating,
            rating_count=count,
            address=address,
            websites=websites,
            win_maker=win_maker,
            description=description
        )