Python _get_soup示例，soup._get_soup Python示例

示例#1

0

显示文件

    def get_beers(self):
        """Generator that provides Beer objects for the brewery's beers.

        Populates this brewery first when it has not been fetched yet, then
        requests the brewer's beer listing and walks the rows of the
        ``brewer-beer-table`` table, skipping the first row and any row
        without a "Rate this beer" link.

        Yields:
            Beer: one object per ratable row, with ``name`` set and ``abv``,
            ``weighted_avg``, ``style_rating`` and ``num_ratings`` set
            whenever the matching cell is non-empty.
        """
        if not self._has_fetched:
            self._populate()

        # With a URL that ends in "/", the second-to-last piece is the ID.
        _id = self.url.split('/')[-2]
        complete_url = u'/Ratings/Beer/ShowBrewerBeers.asp?BrewerID={0}'.format(
            _id)
        soup = soup_helper._get_soup(complete_url)
        soup_beer_rows = soup.find('table',
                                   id='brewer-beer-table').findAll('tr')

        for row in soup_beer_rows[1:]:
            # Only return rows that are ratable. This is checked before
            # touching ``row.a`` so rows without any link are skipped
            # instead of raising AttributeError.
            if not row.find('a', title="Rate this beer"):
                continue
            # Remove any whitespace characters. Rare, but possible.
            url = re.sub(r"\s+", "", row.a.get('href'), flags=re.UNICODE)
            beer = Beer(url)
            beer.name = row.a.text.strip()
            # Add attributes from row; look the cells up once.
            cells = row.findAll('td')
            # Strip every cell (including abv) so a whitespace-only cell is
            # treated as empty rather than handed to float()/int().
            abv = cells[1].text.strip()
            weighted_avg = cells[4].text.strip()
            style_rating = cells[5].text.strip()
            num_ratings = cells[6].text.strip()
            if abv:
                beer.abv = float(abv)
            if weighted_avg:
                beer.weighted_avg = float(weighted_avg)
            if style_rating:
                beer.style_rating = int(style_rating)
            if num_ratings:
                beer.num_ratings = int(num_ratings)
            yield beer

示例#2

0

显示文件

文件： models.py 项目： shamrt/ratebeer

    def _populate(self):
        """Fetch the brewery page at ``self.url`` and fill in this object.

        Sets ``name``, ``type``, ``web`` (only when a website link is
        present), ``telephone``, ``street``, ``city``, ``state``,
        ``country`` and ``postal_code``, then marks the object as fetched.

        Returns:
            This object (``self``).

        Raises:
            rb_exceptions.PageNotFound: if the page contains no ``div`` with
                the schema.org LocalBusiness item type.
        """

        soup = soup_helper._get_soup(self.url)
        s_contents = soup.find_all(
            'div', {'itemtype': 'http://schema.org/LocalBusiness'})
        if not s_contents:
            raise rb_exceptions.PageNotFound(self.url)
        business = s_contents[0]

        self.name = soup.h1.text
        self.type = business.find_all('div')[1].text.strip()
        # The website link is optional. find() returns None when nothing
        # matches, whereas find_all(...)[0] raised IndexError before the
        # presence check could ever run.
        media_links = business.find('div', class_='media-links')
        website = media_links.find('a') if media_links is not None else None
        if website is not None:
            self.web = website['href']
        self.telephone = Brewery._find_span(business, 'telephone')
        self.street = Brewery._find_span(business, 'streetAddress')
        self.city = Brewery._find_span(business, 'addressLocality')
        self.state = Brewery._find_span(business, 'addressRegion')
        self.country = Brewery._find_span(business, 'addressCountry')
        self.postal_code = Brewery._find_span(business, 'postalCode')
        self._has_fetched = True

        return self

示例#3

0

显示文件

文件： models.py 项目： shamrt/ratebeer

    def get_beers(self):
        """Generator that provides Beer objects for the brewery's beers.

        The brewery is populated first if it has not been fetched. The
        brewer's beer listing is then requested and each row of the
        ``brewer-beer-table`` table after the first is inspected; rows
        without a "Rate this beer" link are skipped.

        Yields:
            Beer: one object per ratable row. ``name`` is always set;
            ``abv``, ``weighted_avg``, ``style_rating`` and ``num_ratings``
            are set only when their cell is non-empty.
        """
        if not self._has_fetched:
            self._populate()

        # Second-to-last "/"-separated piece of the URL is used as the ID.
        _id = self.url.split('/')[-2]
        complete_url = u'/Ratings/Beer/ShowBrewerBeers.asp?BrewerID={0}'.\
            format(_id)
        soup = soup_helper._get_soup(complete_url)
        soup_beer_rows = soup.find(
            'table', id='brewer-beer-table').findAll('tr')

        for row in soup_beer_rows[1:]:
            # Only return rows that are ratable. Test this first: a row
            # with no link at all has ``row.a is None`` and must be skipped
            # rather than dereferenced.
            if not row.find('a', title="Rate this beer"):
                continue
            link = row.a
            # Remove any whitespace characters. Rare, but possible.
            url = re.sub(r"\s+", "", link.get('href'), flags=re.UNICODE)
            beer = Beer(url)
            beer.name = link.text.strip()
            # Add attributes from row. Cells are fetched once and every
            # value is stripped, so whitespace-only cells count as empty
            # (previously an unstripped abv cell could make float() raise).
            columns = row.findAll('td')
            abv = columns[1].text.strip()
            weighted_avg = columns[4].text.strip()
            style_rating = columns[5].text.strip()
            num_ratings = columns[6].text.strip()
            if abv:
                beer.abv = float(abv)
            if weighted_avg:
                beer.weighted_avg = float(weighted_avg)
            if style_rating:
                beer.style_rating = int(style_rating)
            if num_ratings:
                beer.num_ratings = int(num_ratings)
            yield beer

示例#4

0

显示文件

    def _populate(self):
        """Load the brewery page at ``self.url`` into this object.

        Fills in ``name``, ``type``, the contact/address attributes
        (``telephone``, ``street``, ``city``, ``state``, ``country``,
        ``postal_code``) and, when the page has a website link, ``web``.
        Finally sets ``_has_fetched`` to True.

        Returns:
            This object (``self``).

        Raises:
            rb_exceptions.PageNotFound: if no ``div`` with the schema.org
                LocalBusiness item type is found on the page.
        """

        soup = soup_helper._get_soup(self.url)
        s_contents = soup.find_all(
            'div', {'itemtype': 'http://schema.org/LocalBusiness'})
        if not s_contents:
            raise rb_exceptions.PageNotFound(self.url)
        info = s_contents[0]

        self.name = soup.h1.text
        self.type = info.find_all('div')[1].text.strip()
        # A brewery may have no website. Indexing find_all(...)[0] raised
        # IndexError in that case, so look the link up with find(), which
        # yields None when absent.
        website = None
        media_links = info.find('div', {'class': 'media-links'})
        if media_links is not None:
            website = media_links.find('a')
        if website is not None:
            self.web = website['href']
        self.telephone = Brewery._find_span(info, 'telephone')
        self.street = Brewery._find_span(info, 'streetAddress')
        self.city = Brewery._find_span(info, 'addressLocality')
        self.state = Brewery._find_span(info, 'addressRegion')
        self.country = Brewery._find_span(info, 'addressCountry')
        self.postal_code = Brewery._find_span(info, 'postalCode')
        self._has_fetched = True

        return self

示例#5

0

显示文件

    def _populate(self):
        """Fetch the brewery page at ``self.url`` and fill in this object.

        Sets ``name``, ``type``, ``web`` (only when a "Web: " label is on
        the page) and the contact/address attributes, then flags the object
        as fetched.

        Returns:
            This object (``self``).

        Raises:
            rb_exceptions.PageNotFound: if the expected container/table
                structure is missing from the page.
        """

        page = soup_helper._get_soup(self.url)
        try:
            container = page.find('div', id='container')
            first_row = container.find('table').find_all('tr')[0]
            cells = first_row.find_all('td')
        except AttributeError:
            raise rb_exceptions.PageNotFound(self.url)

        self.name = page.h1.text
        type_matches = re.findall(r'Type: (.*?)<br\/>', page.decode_contents())
        self.type = type_matches[0].strip()
        web_labels = page.find_all(string='Web: ')
        if web_labels:
            # The link is the element that follows the "Web: " label.
            self.web = web_labels[0].find_next()['href']

        first_cell = cells[0]
        span_fields = (
            ('telephone', 'telephone'),
            ('street', 'streetAddress'),
            ('city', 'addressLocality'),
            ('state', 'addressRegion'),
            ('country', 'addressCountry'),
            ('postal_code', 'postalCode'),
        )
        for attribute, span_name in span_fields:
            setattr(self, attribute, Brewery._find_span(first_cell, span_name))
        self._has_fetched = True

        return self

示例#6

0

显示文件

文件： ratebeer.py 项目： parryc/record.beer

    def beer_style(self, ident, sort_type=None, sort_order=None):
        """Get all the beers from a specific beer style page.

        This is a generator, so argument validation and the page request
        happen when iteration starts.

        Args:
            ident (integer): The ID of the beer style from beer_style_list().
                For example, for 'Abbey Dubbel' it would be 71.
            sort_type (string): The sorting of the results. The valid choices
                are "score" (default), "count", and "abv".
            sort_order (string): "ascending" (low-to-high) or
                "descending" (high-to-low, default)

        Yields:
            models.Beer objects, each with ``name`` set from the link text.

        Raises:
            ValueError: if ``sort_type`` or ``sort_order`` is not one of the
                valid choices (matched case-insensitively).
        """
        if sort_type is None:
            sort_type = 'score'
        if sort_order is None:
            sort_order = 'descending'
        so = {'score': 0, 'count': 1, 'abv': 2}.get(sort_type.lower())
        o = {'descending': 0, 'ascending': 1}.get(sort_order.lower())
        # Reject unknown choices instead of sending "None" in the query.
        if so is None:
            raise ValueError("Invalid ``sort_type``.")
        if o is None:
            raise ValueError("Invalid ``sort_order``.")

        soup = soup_helper._get_soup('/ajax/top-beer.asp?s={}&so={}&o={}'.format(ident, so, o))
        # Slice off the header row. A bare next() on an empty row list would
        # raise StopIteration inside this generator, which Python 3.7+
        # converts into RuntimeError (PEP 479).
        for row in soup.table.find_all('tr')[1:]:
            data = row.find_all('td')
            link = data[1].a
            dataout = models.Beer(link.get('href'))
            dataout.name = link.text
            yield dataout

示例#7

0

显示文件

文件： models.py 项目： acf5118/ratebeer

    def _populate(self):
        """Load brewery details from the page at ``self.url``.

        Assigns ``name``, ``type``, optionally ``web`` (when the page has a
        "Web: " label), and the telephone/address attributes, and finally
        sets ``_has_fetched``.

        Returns:
            This object (``self``).

        Raises:
            rb_exceptions.PageNotFound: if looking up the container table
                cells fails with AttributeError.
        """

        soup = soup_helper._get_soup(self.url)
        try:
            table = soup.find('div', id='container').find('table')
            s_contents = table.find_all('tr')[0].find_all('td')
        except AttributeError:
            raise rb_exceptions.PageNotFound(self.url)

        self.name = soup.h1.text
        html = soup.decode_contents()
        self.type = re.findall(r'Type: (.*?)<br\/>', html)[0].strip()
        web_strings = soup.find_all(string='Web: ')
        if web_strings:
            web_link = web_strings[0].find_next()
            self.web = web_link['href']
        # Attribute name -> span name passed to Brewery._find_span.
        for attr_name, prop in [('telephone', 'telephone'),
                                ('street', 'streetAddress'),
                                ('city', 'addressLocality'),
                                ('state', 'addressRegion'),
                                ('country', 'addressCountry'),
                                ('postal_code', 'postalCode')]:
            setattr(self, attr_name, Brewery._find_span(s_contents[0], prop))
        self._has_fetched = True

        return self

示例#8

0

显示文件

    def beer_style(self, ident, sort_type=None, sort_order=None):
        """Get all the beers from a specific beer style page.

        Because this is a generator, nothing (including validation) runs
        until the first item is requested.

        Args:
            ident (integer): The ID of the beer style from beer_style_list().
                For example, for 'Abbey Dubbel' it would be 71.
            sort_type (string): The sorting of the results. The valid choices
                are "score" (default), "count", and "abv".
            sort_order (string): "ascending" (low-to-high) or
                "descending" (high-to-low, default)

        Yields:
            models.Beer objects with ``name`` taken from the link text.

        Raises:
            ValueError: if ``sort_type`` or ``sort_order`` is not a valid
                choice (comparison is case-insensitive).
        """
        sort_type_codes = {'score': 0, 'count': 1, 'abv': 2}
        sort_order_codes = {'descending': 0, 'ascending': 1}

        if sort_type is None:
            sort_type = 'score'
        if sort_order is None:
            sort_order = 'descending'
        sort_type = sort_type.lower()
        sort_order = sort_order.lower()
        # An unknown choice used to end up as the text "None" in the URL.
        if sort_type not in sort_type_codes:
            raise ValueError("Invalid ``sort_type``.")
        if sort_order not in sort_order_codes:
            raise ValueError("Invalid ``sort_order``.")
        so = sort_type_codes[sort_type]
        o = sort_order_codes[sort_order]

        soup = soup_helper._get_soup('/ajax/top-beer.asp?s={}&so={}&o={}'.format(ident, so, o))
        rows = iter(soup.table.find_all('tr'))
        # Get rid of the header. The default argument keeps an empty table
        # from raising StopIteration inside the generator (RuntimeError
        # under PEP 479).
        next(rows, None)
        for row in rows:
            data = row.find_all('td')
            link = data[1].a
            dataout = models.Beer(link.get('href'))
            dataout.name = link.text
            yield dataout

示例#9

0

显示文件

文件： models.py 项目： ivandjuricic/ratebeer

    def _populate(self):
        """Populate this brewery from the page at ``self.url``.

        Sets ``name``, ``type``, ``web`` (only if a "Web: " label exists),
        ``telephone``, ``street``, ``city``, ``state``, ``country`` and
        ``postal_code``; then marks the object as fetched.

        Returns:
            This object (``self``).

        Raises:
            rb_exceptions.PageNotFound: if the container table lookup hits
                an AttributeError (a missing element).
        """

        document = soup_helper._get_soup(self.url)
        try:
            rows = document.find("div", id="container").find("table").find_all("tr")
            s_contents = rows[0].find_all("td")
        except AttributeError:
            raise rb_exceptions.PageNotFound(self.url)

        self.name = document.h1.text
        type_hits = re.findall(r"Type: (.*?)<br\/>", document.decode_contents())
        self.type = type_hits[0].strip()
        web_markers = document.find_all(string="Web: ")
        if web_markers:
            self.web = web_markers[0].find_next()["href"]

        def span(name):
            # Look the span up in the first cell each time it is needed.
            return Brewery._find_span(s_contents[0], name)

        self.telephone = span("telephone")
        self.street = span("streetAddress")
        self.city = span("addressLocality")
        self.state = span("addressRegion")
        self.country = span("addressCountry")
        self.postal_code = span("postalCode")
        self._has_fetched = True

        return self

示例#10

0

显示文件

文件： ratebeer.py 项目： parryc/record.beer

    def beer_style_list(self):
        """Returns the beer styles from the beer styles page.

        Only ``option`` elements of the ``StyleMenu`` select that carry a
        truthy ``name`` attribute are included.

        Returns:
            A dictionary, with beer styles strings for keys and integer ids
            for values.
        """
        menu = soup_helper._get_soup("/top/").find('select', id="StyleMenu")
        named_options = [
            option for option in menu.find_all('option') if option.get('name')
        ]
        return {
            option.text.strip(): int(option.get('value'))
            for option in named_options
        }

示例#11

0

显示文件

    def beer_style_list(self):
        """Returns the beer styles from the beer styles page.

        Walks the ``option`` elements of the ``StyleMenu`` select and keeps
        those that have a truthy ``name`` attribute.

        Returns:
            A dictionary, with beer styles strings for keys and integer ids
            for values.
        """
        soup = soup_helper._get_soup("/top/")
        style_menu = soup.find('select', id="StyleMenu")
        styles = {}
        for option in style_menu.find_all('option'):
            if not option.get('name'):
                continue
            style_name = option.text.strip()
            styles[style_name] = int(option.get('value'))
        return styles

示例#12

0

显示文件

文件： ratebeer.py 项目： phelian/ratebeer

    def beer_style_list(self):
        """Returns the beer styles from the beer styles page.

        Reads every list item found in the cells of the third table on the
        page.

        Returns:
            A dictionary, with beer styles for keys and urls for values.
        """
        soup = soup_helper._get_soup("/beerstyles/")
        style_table = soup.find_all('table')[2]
        return {
            entry.text: entry.a.get('href')
            for cell in style_table.find_all('td')
            for entry in cell.find_all('li')
        }

示例#13

0

显示文件

文件： models.py 项目： shamrt/ratebeer

    def get_reviews(self, review_order="most recent"):
        """Returns reviews for a specific beer.

        Review pages are requested one after another, starting at page 1,
        until a page contains no reviews.

        Args:
            review_order (string): How to sort reviews (case-insensitive).
                Three inputs:
                most recent: Newer reviews appear earlier.
                top raters: RateBeer.com top raters appear earlier.
                highest score: Reviews with the highest overall score appear
                earlier.

        Yields:
            Review objects, one per review found.

        Raises:
            ValueError: if ``review_order`` is not one of the three inputs.
        """

        if not self._has_fetched:
            self._populate()

        review_order = review_order.lower()
        url_codes = {
            "most recent": 1,
            "top raters": 2,
            "highest score": 3
        }
        url_flag = url_codes.get(review_order)
        if not url_flag:
            raise ValueError("Invalid ``review_order``.")

        page_number = 1
        while True:
            complete_url = u'{0}{1}/{2}/'.format(
                self.url, url_flag, page_number)
            soup = soup_helper._get_soup(complete_url)
            reviews_container = soup.find('div', class_='reviews-container')
            reviews = reviews_container.find_all(
                'div', style='padding: 0px 0px 0px 0px;')
            if not reviews:
                # End the generator with a plain return: raising
                # StopIteration here becomes RuntimeError on Python 3.7+
                # (PEP 479).
                return

            for review_soup in reviews:
                yield Review(review_soup)

            page_number += 1
示例#14

0

显示文件

    def get_beers(self):
        """Generator that provides Beer objects for the brewery's beers.

        Populates the brewery first if needed, then requests listing pages
        one after another (starting at page 1) until a page's
        ``maintable nohover`` table has fewer than two rows. The first row
        of every page is skipped, as are rows without a ``rate`` element.

        Yields:
            Beer: one object per ratable row, with ``name`` set and ``abv``,
            ``weighted_avg``, ``overall_rating``, ``style_rating`` and
            ``num_ratings`` set whenever the matching cell is non-empty.
        """
        if not self._has_fetched:
            self._populate()

        page_number = 1
        while True:
            complete_url = u'{0}0/{1}/'.format(self.url, page_number)
            soup = soup_helper._get_soup(complete_url)
            soup_beer_rows = soup.find('table',
                                       'maintable nohover').findAll('tr')

            if len(soup_beer_rows) < 2:
                # Finish with return: ``raise StopIteration`` inside a
                # generator is turned into RuntimeError on Python 3.7+
                # (PEP 479).
                return

            for row in soup_beer_rows[1:]:
                # Only return rows that are ratable. Checked before
                # ``row.a`` is used so unratable rows without a link are
                # skipped rather than dereferenced.
                if not row.find(class_='rate'):
                    continue
                # Remove any whitespace characters. Rare, but possible.
                url = re.sub(r"\s+", "", row.a.get('href'), flags=re.UNICODE)
                beer = Beer(url)
                beer.name = row.a.text.strip()
                # Add attributes from row; look the cells up once and strip
                # every value (abv included) so blank cells count as empty.
                cells = row.findAll('td')
                abv = cells[2].text.strip()
                weighted_avg = cells[3].text.strip()
                overall_rating = cells[4].text.strip()
                style_rating = cells[5].text.strip()
                num_ratings = cells[6].text.strip()
                if abv:
                    beer.abv = float(abv)
                if weighted_avg:
                    beer.weighted_avg = float(weighted_avg)
                if overall_rating:
                    beer.overall_rating = int(overall_rating)
                if style_rating:
                    beer.style_rating = int(style_rating)
                if num_ratings:
                    beer.num_ratings = int(num_ratings)
                yield beer

            page_number += 1

示例#15

0

显示文件

文件： models.py 项目： ivandjuricic/ratebeer

    def get_beers(self):
        """Generator that provides Beer objects for the brewery's beers.

        After making sure the brewery is populated, listing pages are
        fetched in order from page 1. Iteration ends at the first page whose
        ``maintable nohover`` table has fewer than two rows. On each page
        the first row is skipped, and so is any row lacking a ``rate``
        element.

        Yields:
            Beer: one object per ratable row. ``name`` is always set; the
            numeric attributes (``abv``, ``weighted_avg``,
            ``overall_rating``, ``style_rating``, ``num_ratings``) are set
            only when their cell is non-empty.
        """
        if not self._has_fetched:
            self._populate()

        page_number = 1
        while True:
            complete_url = u"{0}0/{1}/".format(self.url, page_number)
            soup = soup_helper._get_soup(complete_url)
            soup_beer_rows = soup.find("table", "maintable nohover").findAll("tr")

            if len(soup_beer_rows) < 2:
                # PEP 479: a generator must end with return; raising
                # StopIteration would surface as RuntimeError.
                return

            for row in soup_beer_rows[1:]:
                # Only return rows that are ratable. Do this before reading
                # the link so unratable rows with no link are just skipped.
                if not row.find(class_="rate"):
                    continue
                link = row.a
                # Remove any whitespace characters. Rare, but possible.
                url = re.sub(r"\s+", "", link.get("href"), flags=re.UNICODE)
                beer = Beer(url)
                beer.name = link.text.strip()
                # Add attributes from row. Every cell text is stripped so a
                # whitespace-only abv cell no longer reaches float().
                columns = row.findAll("td")
                abv = columns[2].text.strip()
                weighted_avg = columns[3].text.strip()
                overall_rating = columns[4].text.strip()
                style_rating = columns[5].text.strip()
                num_ratings = columns[6].text.strip()
                if abv:
                    beer.abv = float(abv)
                if weighted_avg:
                    beer.weighted_avg = float(weighted_avg)
                if overall_rating:
                    beer.overall_rating = int(overall_rating)
                if style_rating:
                    beer.style_rating = int(style_rating)
                if num_ratings:
                    beer.num_ratings = int(num_ratings)
                yield beer

            page_number += 1

示例#16

0

显示文件

文件： models.py 项目： fpierfed/ratebeer

    def get_reviews(self, review_order="most recent"):
        """Returns reviews for a specific beer.

        Pages of reviews are fetched sequentially from page 1; iteration
        ends at the first page that has no reviews.

        Args:
            review_order (string): How to sort reviews (case-insensitive).
                Three inputs:
                most recent: Newer reviews appear earlier.
                top raters: RateBeer.com top raters appear earlier.
                highest score: Reviews with the highest overall score appear
                earlier.

        Yields:
            Review objects, one for each review found.

        Raises:
            ValueError: if ``review_order`` is not one of the three inputs.
        """

        if not self._has_fetched:
            self._populate()

        review_order = review_order.lower()
        url_codes = {
            "most recent": 1,
            "top raters": 2,
            "highest score": 3
        }
        url_flag = url_codes.get(review_order)
        if not url_flag:
            raise ValueError("Invalid ``review_order``.")

        page_number = 1
        while True:
            complete_url = u'{0}{1}/{2}/'.format(self.url, url_flag, page_number)
            soup = soup_helper._get_soup(complete_url)
            content = soup.find('div', class_='reviews-container')
            reviews = content.find_all('div', style='padding: 0px 0px 0px 0px;')
            if len(reviews) < 1:
                # Use return, not ``raise StopIteration``: since PEP 479 the
                # latter is converted to RuntimeError inside a generator.
                return

            for review_soup in reviews:
                yield Review(review_soup)

            page_number += 1

示例#17

0

显示文件

文件： models.py 项目： Surye/ratebeer

    def get_beers(self):
        """Generator that yields a Beer for each ratable row of the listing.

        Listing pages are requested in order starting at page 1; iteration
        ends at the first page whose ``maintable nohover`` table has fewer
        than two rows. The first row of each page is skipped, as are rows
        without a ``rate`` element and rows whose ``Beer`` construction
        raises ``rb_exceptions.PageNotFound``.
        """
        page_number = 1
        while True:
            complete_url = u'{0}0/{1}/'.format(self.url, page_number)
            soup = soup_helper._get_soup(complete_url)
            soup_beer_rows = soup.find('table', 'maintable nohover').findAll('tr')

            if len(soup_beer_rows) < 2:
                # End with return: raising StopIteration inside a generator
                # becomes RuntimeError on Python 3.7+ (PEP 479).
                return

            for row in soup_beer_rows[1:]:
                url = row.a.get('href')
                # Only return rows that are ratable
                if not row.find(class_='rate'):
                    continue
                # sometimes the beer is listed but it doesn't have a page
                # ignore it for now
                try:
                    beer = Beer(url)
                except rb_exceptions.PageNotFound:
                    continue
                yield beer

            page_number += 1

示例#18

0

显示文件

    def _populate(self):
        """Fetch the beer page at ``self.url`` and fill in this object.

        Sets ``name``, ``brewery``, ``brewed_at``, ``overall_rating``,
        ``style_rating``, ``style``, ``style_url``, ``img_url``,
        ``num_ratings``, ``mean_rating``, ``weighted_avg``, ``seasonal``,
        ``ibu``, ``calories``, ``abv``, ``retired`` and ``tags``.
        ``description`` is set only when the page does not say there is no
        commercial description. Most optional values fall back to None when
        the lookup fails with one of the exceptions caught below.

        Returns:
            This object (``self``), with ``_has_fetched`` set to True.

        Raises:
            rb_exceptions.PageNotFound: if the container/table lookup fails
                with AttributeError.
        """
        soup = soup_helper._get_soup(self.url)
        # check for 404s
        try:
            soup_rows = soup.find('div',
                                  id='container').find('table').find_all('tr')
        except AttributeError:
            raise rb_exceptions.PageNotFound(self.url)
        # ratebeer pages don't actually 404, they just send you to this weird
        # "beer reference" page but the url doesn't actually change, it just
        # seems like it's all getting done server side -- so we have to look
        # for the contents h1 to see if we're looking at the beer reference or
        # not
        # if "beer reference" in soup_rows[0].find_all('td')[1].h1.contents:
        #     raise rb_exceptions.PageNotFound(self.url)

        # if "Also known as " in soup_rows[1].find_all('td')[1].div.div.contents:
        #     raise rb_exceptions.AliasedBeer(self.url, soup_rows[1].find_all('td')[1].div.div.a['href'])

        # NOTE(review): soup_rows comes from find_all(), which presumably
        # never returns None, so this check looks unreachable -- confirm.
        if soup_rows is None:
            raise rb_exceptions.PageNotFound(self.url)

        # General information from the top of the page
        self.name = soup.find(itemprop='name').text.strip()
        # Every link whose href matches 'brewers'; index 1 is used as the
        # brewery and, when there are exactly three links, index 2 as the
        # "brewed at" brewery.
        breweries = soup.find_all('a', href=re.compile('brewers'))
        self.brewery = Brewery(breweries[1].get('href'))
        self.brewery.name = breweries[1].text
        if len(breweries) == 3:
            self.brewed_at = Brewery(breweries[2].get('href'))
            self.brewed_at.name = breweries[2].text
        else:
            self.brewed_at = None
        try:
            self.overall_rating = int(
                soup.find('span',
                          text='overall').next_sibling.next_sibling.text)
        except ValueError:  # 'n/a'
            self.overall_rating = None
        except AttributeError:
            self.overall_rating = None
        # NOTE(review): unlike the overall rating, the sibling itself (not
        # its .text) is passed to int() here -- confirm this is intended.
        try:
            self.style_rating = int(
                soup.find('span',
                          text='style').previous_sibling.previous_sibling)
        except ValueError:  # 'n/a'
            self.style_rating = None
        except AttributeError:
            self.style_rating = None
        self.style = soup.find(text='Style: ').next_sibling.text
        self.style_url = soup.find(text='Style: ').next_sibling.get('href')
        self.img_url = soup.find(id="beerImg").get('src')
        # Data from the info bar
        self.num_ratings = int(soup.find('span', itemprop="ratingCount").text)
        try:
            # Keep only the part before the "/" of the mean rating text.
            self.mean_rating = float(
                soup.find(text='MEAN: ').next_sibling.text.split('/')[0])
        except ValueError:  # Empty mean rating: '/5.0'
            self.mean_rating = None
        except AttributeError:  # No mean rating
            self.mean_rating = None
        try:
            self.weighted_avg = float(
                soup.find(attrs={
                    "name": "real average"
                }).find('span', itemprop="ratingValue").text)
        except ValueError:  # Empty weighted average rating: '/5'
            self.weighted_avg = None
        except AttributeError:  # No weighted average rating
            self.weighted_avg = None
        try:
            self.seasonal = soup.find(
                text=u'\xa0\xa0 SEASONAL: ').next_sibling.text
        except AttributeError:
            self.seasonal = None
        # NOTE(review): only AttributeError is handled for ibu and calories;
        # a non-numeric value would raise ValueError from int() -- confirm
        # whether that can occur.
        try:
            self.ibu = int(
                soup.find(
                    title="International Bittering Units - Normally from hops"
                ).next_sibling.next_sibling.text)
        except AttributeError:
            self.ibu = None
        try:
            self.calories = int(
                soup.find(
                    title="Estimated calories for a 12 fluid ounce serving").
                next_sibling.next_sibling.text)
        except AttributeError:
            self.calories = None
        # The last character of the ABV text is dropped before conversion
        # (presumably a trailing '%' -- verify against the page).
        # NOTE(review): only ValueError is handled here; a missing ABV
        # element would raise AttributeError.
        try:
            self.abv = float(
                soup.find(title="Alcohol By Volume").next_sibling.next_sibling.
                text[:-1])
        except ValueError:  # Empty ABV: '-'
            self.abv = None
        if soup.find(title="Currently out of production"):
            self.retired = True
        else:
            self.retired = False
        # Description
        description = soup.find('div', 'commercial-description-container')
        if 'no commercial description' not in description.text.lower():
            # strip ads
            [s.extract() for s in description('small')]
            self.description = ' '.join([s for s in description.strings
                                         ]).strip()
        # Tag text with its first character removed.
        self.tags = [
            t.text[1:] for t in soup.find_all('span', class_="tagLink")
        ]

        self._has_fetched = True

        return self

示例#19

0

显示文件

文件： models.py 项目： Surye/ratebeer

    def __init__(self, url):
        """Fetch the beer page at ``url`` and populate this object from it.

        Depending on what the page contains, sets the meta attributes named
        in ``keyword_lookup`` (``num_ratings``, ``mean_rating``,
        ``weighted_avg``, ``seasonal``, ``calories``, ``abv``, ``ibu``),
        ``brewery``/``brewery_url``, ``brewed_at``/``brewed_at_url``,
        ``overall_rating``, ``style_rating``, ``style``,
        ``brewery_country`` and ``description``; ``url`` and ``name`` are
        always set.

        Args:
            url (string): The url of the beer page to load.

        Raises:
            rb_exceptions.PageNotFound: if the container table is missing or
                the page is the "beer reference" placeholder.
            rb_exceptions.AliasedBeer: if the page says the beer is
                "Also known as" another beer.
        """

        soup = soup_helper._get_soup(url)
        # check for 404s
        try:
            soup_rows = soup.find('div', id='container').find('table').find_all('tr')
        except AttributeError:
            raise rb_exceptions.PageNotFound(url)
        # ratebeer pages don't actually 404, they just send you to this weird
        # "beer reference" page but the url doesn't actually change, it just
        # seems like it's all getting done server side -- so we have to look
        # for the contents h1 to see if we're looking at the beer reference or
        # not
        if "beer reference" in soup_rows[0].find_all('td')[1].h1.contents:
            raise rb_exceptions.PageNotFound(url)

        # Second cell of the second row; reused for the alias check, the
        # meta information and the description below.
        detail_cell = soup_rows[1].find_all('td')[1]
        if "Also known as " in detail_cell.div.div.contents:
            raise rb_exceptions.AliasedBeer(url, detail_cell.div.div.a['href'])

        # get beer meta information
        # grab the html and split it into a keyword and value
        brew_info_html = detail_cell.div.small
        keyword_lookup = {
            "RATINGS": "num_ratings",
            "MEAN": "mean_rating",
            "WEIGHTED AVG": "weighted_avg",
            "SEASONAL": "seasonal",
            "CALORIES": "calories",
            "EST. CALORIES": "calories",
            "ABV": "abv",
            "IBU": "ibu",
        }
        # match the data pulled from the brew info and match it to the
        # keyword in the lookup table
        for entry in brew_info_html.text.split(u'\xa0\xa0'):
            # partition() tolerates chunks with no ': ' (or more than one),
            # which made the former two-name tuple unpacking raise.
            meta_name, _, meta_data = entry.partition(': ')
            match = keyword_lookup.get(meta_name.strip())
            if not match:
                continue
            if match == "mean_rating":
                # Keep only the part before the "/". This used to compare
                # against "mean", which is never a lookup value, so the
                # mean was never trimmed and stayed a string.
                meta_data = meta_data.split("/")[0]
            elif match == "abv":
                # Drop the trailing character before conversion.
                meta_data = meta_data[:-1]
            # convert to a number if possible, otherwise keep the text
            try:
                if match == "num_ratings":
                    meta_data = int(meta_data)
                else:
                    meta_data = float(meta_data)
            except ValueError:
                pass
            setattr(self, match, meta_data)

        info = soup_rows[1].tr.find_all('td')

        # get basic brewery information
        brewery_info = info[1].find('div').contents
        brewery = brewery_info[0].findAll('a')[0]
        brewed_at = None
        if 'brewed at' in brewery_info[0].text.lower():
            # NOTE(review): this picks the same link (index 0) as
            # ``brewery`` above -- confirm whether another index was meant.
            brewed_at = brewery_info[0].findAll('a')[0]
        if brewery:
            self.brewery = brewery.text.strip()
            self.brewery_url = brewery.get('href')
        if brewed_at:
            self.brewed_at = brewed_at.text.strip()
            self.brewed_at_url = brewed_at.get('href')

        # get ratings
        ratings = info[0].findAll('div')
        if len(ratings) > 1:
            overall_rating = ratings[1].findAll('span')
            style_rating = ratings[3].findAll('span')
        else:
            overall_rating = None
            style_rating = None
        if overall_rating and overall_rating[1].text != 'n/a':
            self.overall_rating = int(overall_rating[1].text)
        if style_rating and style_rating[0].text != 'n/a':
            self.style_rating = int(style_rating[0].text)

        # get the beer style
        if brewery_info[3]:
            self.style = brewery_info[3].text.strip()

        # get the beer country
        if ',' in brewery_info[5]:
            # Non-USA addresses
            self.brewery_country = brewery_info[5].split(',')[1].strip()
        else:
            # USA addresses
            self.brewery_country = brewery_info[8].strip()

        # get the beer description
        description = detail_cell.find(
            'div',
            style=(
                'border: 1px solid #e0e0e0; background: #fff; '
                'padding: 14px; color: #777;'
            )
        )
        if 'no commercial description' not in description.text.lower():
            # strip ads
            for small in description('small'):
                small.extract()
            self.description = ' '.join(description.strings).strip()

        # get url
        self.url = soup.find('link', rel='canonical')['href'].replace(soup_helper._BASE_URL, '')

        # get name
        self.name = soup_rows[0].find_all('td')[1].h1.text.strip()

示例#20

0

显示文件

文件： models.py 项目： ivandjuricic/ratebeer

    def _populate(self):
        soup = soup_helper._get_soup(self.url)
        # check for 404s
        try:
            soup_rows = soup.find("div", id="container").find("table").find_all("tr")
        except AttributeError:
            raise rb_exceptions.PageNotFound(self.url)
        # ratebeer pages don't actually 404, they just send you to this weird
        # "beer reference" page but the url doesn't actually change, it just
        # seems like it's all getting done server side -- so we have to look
        # for the contents h1 to see if we're looking at the beer reference or
        # not
        if "beer reference" in soup_rows[0].find_all("td")[1].h1.contents:
            raise rb_exceptions.PageNotFound(self.url)

        if "Also known as " in soup_rows[1].find_all("td")[1].div.div.contents:
            raise rb_exceptions.AliasedBeer(self.url, soup_rows[1].find_all("td")[1].div.div.a["href"])

        # get beer meta information
        # grab the html and split it into a keyword and value
        brew_info_divs = soup_rows[1].find_all("table")[0].find_all("div")
        brew_info_html = soup_rows[1].find_all("td")[1].div.small
        brew_info = [s.split(": ") for s in brew_info_html.text.split(u"\xa0\xa0")]

        # get ratings
        if "No Score" in brew_info_divs[0].text:
            self.overall_rating = None
            self.style_rating = None
            self.style = brew_info_divs[2].find_all("a")[1].text
        else:
            overalltxt = brew_info_divs[0].text
            self.overall_rating = int(brew_info_divs[0].text[-sum(c.isdigit() for c in overalltxt) :])
            styleratingtxt = brew_info_divs[3].text
            self.style_rating = int(brew_info_divs[3].text[: sum(c.isdigit() for c in overalltxt) :])

            self.style = brew_info_divs[4].find_all("a")[1].text

        keyword_lookup = {
            "RATINGS": "num_ratings",
            "MEAN": "mean_rating",
            "WEIGHTED AVG": "weighted_avg",
            "SEASONAL": "seasonal",
            "CALORIES": "calories",
            "EST. CALORIES": "calories",
            "ABV": "abv",
            "IBU": "ibu",
        }
        # match the data pulled from the brew info and match it to they keyword
        # in the lookup table
        for meta_name, meta_data in brew_info:
            match = keyword_lookup.get(meta_name.strip())
            if match == "mean":
                meta_data = meta_data[: meta_data.find("/")]
            elif match == "abv":
                meta_data = meta_data[:-1]
            elif not match:
                continue
            # convert to float if possible
            try:
                if match == "num_ratings":
                    meta_data = int(meta_data)
                else:
                    meta_data = float(meta_data)
            except ValueError:
                pass

            setattr(self, match, meta_data)

        # populate with image url also
        self.image = soup_rows[1].img["src"]
        setattr(self, "image", self.image)

        info = soup_rows[1].tr.find_all("td")

        # get basic brewery information
        brewery_info = info[1].find("div").contents
        brewery_urls = brewery_info[0].findAll("a")
        brewery = brewery_urls[0]
        brewed_at = None
        if len(brewery_urls) == 2:
            brewed_at = brewery_urls[1]
        if brewery:
            self.brewery = brewery.text.strip()
            self.brewery_url = brewery.get("href")
        if brewed_at:
            self.brewed_at = brewed_at.text.strip()
            self.brewed_at_url = brewed_at.get("href")

        # get the beer style
        if brewery_info[3]:
            self.style = brewery_info[3].text.strip()

        # get the beer country
        if "," in brewery_info[5]:
            # Non-USA addresses
            self.brewery_country = brewery_info[5].split(",")[1].strip()
        else:
            # USA addresses
            self.brewery_country = brewery_info[8].strip()

        # get the beer description
        description = (
            soup_rows[1]
            .find_all("td")[1]
            .find("div", style=("border: 1px solid #e0e0e0; background: #fff; " "padding: 14px; color: #777;"))
        )
        if "no commercial description" not in description.text.lower():
            # strip ads
            [s.extract() for s in description("small")]
            self.description = " ".join([s for s in description.strings]).strip()

        self.name = soup.h1.text
        self._has_fetched = True
        return self

示例#21

0

显示文件

    def _populate(self):
        """Fetch the page at ``self.url`` and fill this object's attributes.

        Sets the info-bar statistics listed in ``keyword_lookup``, brewery
        details, ratings (when the ratings cell has more than three divs),
        style, brewery country, description (when a commercial description
        exists) and name.

        Returns:
            This object, with ``_has_fetched`` set to True.

        Raises:
            rb_exceptions.PageNotFound: if the expected container/table
                structure is missing, or the page heading is the
                "beer reference" placeholder.
            rb_exceptions.AliasedBeer: if the page contains an
                "Also known as " marker pointing at another beer.
        """
        soup = soup_helper._get_soup(self.url)
        # check for 404s
        try:
            soup_rows = soup.find('div',
                                  id='container').find('table').find_all('tr')
        except AttributeError:
            raise rb_exceptions.PageNotFound(self.url)
        # ratebeer pages don't actually 404, they just send you to this weird
        # "beer reference" page but the url doesn't actually change, it just
        # seems like it's all getting done server side -- so we have to look
        # for the contents h1 to see if we're looking at the beer reference or
        # not
        if "beer reference" in soup_rows[0].find_all('td')[1].h1.contents:
            raise rb_exceptions.PageNotFound(self.url)

        if "Also known as " in soup_rows[1].find_all('td')[1].div.div.contents:
            raise rb_exceptions.AliasedBeer(
                self.url, soup_rows[1].find_all('td')[1].div.div.a['href'])

        # get beer meta information
        # grab the html and split it into a keyword and value
        brew_info_html = soup_rows[1].find_all('td')[1].div.small
        brew_info = [
            s.split(': ') for s in brew_info_html.text.split(u'\xa0\xa0')
        ]
        keyword_lookup = {
            "RATINGS": "num_ratings",
            "MEAN": "mean_rating",
            "WEIGHTED AVG": "weighted_avg",
            "SEASONAL": "seasonal",
            "CALORIES": "calories",
            "EST. CALORIES": "calories",
            "ABV": "abv",
            "IBU": "ibu",
        }
        # match the data pulled from the brew info to the keyword in the
        # lookup table
        for meta_name, meta_data in brew_info:
            match = keyword_lookup.get(meta_name.strip())
            if not match:
                # keyword we do not track
                continue
            if match == "mean_rating":
                # keep only the part before the "/" (e.g. "3.5/5.0" -> "3.5");
                # the original compared against "mean", which the lookup
                # table never produces, so this step was silently skipped
                meta_data = meta_data.split("/")[0]
            elif match == "abv":
                # drop the trailing unit character
                meta_data = meta_data[:-1]
            # convert to a number if possible, otherwise keep the raw text
            try:
                if match == "num_ratings":
                    meta_data = int(meta_data)
                else:
                    meta_data = float(meta_data)
            except ValueError:
                pass
            setattr(self, match, meta_data)

        info = soup_rows[1].tr.find_all('td')

        # get basic brewery information
        brewery_info = info[1].find('div').contents
        brewery_urls = brewery_info[0].findAll('a')
        brewery = brewery_urls[0]
        brewed_at = None
        # a second link, when present, is the "brewed at" brewery
        if len(brewery_urls) == 2:
            brewed_at = brewery_urls[1]
        if brewery:
            self.brewery = brewery.text.strip()
            self.brewery_url = brewery.get('href')
        if brewed_at:
            self.brewed_at = brewed_at.text.strip()
            self.brewed_at_url = brewed_at.get('href')

        # get ratings; they are stored as the raw parsed nodes, unconverted
        ratings = info[0].findAll('div')
        if len(ratings) > 3:
            self.overall_rating = ratings[1].contents[2]
            self.style_rating = ratings[3].contents[0]

        # get the beer style
        if brewery_info[3]:
            self.style = brewery_info[3].text.strip()

        # get the beer country
        if ',' in brewery_info[5]:
            # Non-USA addresses
            self.brewery_country = brewery_info[5].split(',')[1].strip()
        else:
            # USA addresses
            self.brewery_country = brewery_info[8].strip()

        # get the beer description
        description = soup_rows[1].find_all('td')[1].find(
            'div',
            style=('border: 1px solid #e0e0e0; background: #fff; '
                   'padding: 14px; color: #777;'))
        if 'no commercial description' not in description.text.lower():
            # strip ads
            for small_tag in description('small'):
                small_tag.extract()
            self.description = ' '.join(description.strings).strip()

        # get name
        self.name = soup_rows[0].find_all('td')[1].h1.text.strip()
        self._has_fetched = True

        return self

示例#22

0

显示文件

文件： models.py 项目： jwrubel/ratebeer

    def _populate(self):
        """Fetch the page at ``self.url`` and fill this object's attributes.

        Sets ``name``, ``brewery`` / ``brewed_at`` (as ``Brewery`` objects),
        ``overall_rating``, ``style_rating``, ``style``, ``style_url``,
        ``img_url``, ``num_ratings``, ``mean_rating``, ``weighted_avg``,
        ``seasonal``, ``ibu``, ``calories``, ``abv``, ``retired``,
        ``description`` (when a commercial description exists) and ``tags``.
        Optional values that are absent or unparsable are stored as None.

        Returns:
            This object, with ``_has_fetched`` set to True.

        Raises:
            rb_exceptions.PageNotFound: if the expected container/table
                structure is missing, or the page heading is the
                "beer reference" placeholder.
            rb_exceptions.AliasedBeer: if the page contains an
                "Also known as " marker pointing at another beer.
        """
        soup = soup_helper._get_soup(self.url)
        # check for 404s
        try:
            soup_rows = soup.find('div', id='container').find('table').find_all('tr')
        except AttributeError:
            raise rb_exceptions.PageNotFound(self.url)
        # ratebeer pages don't actually 404, they just send you to this weird
        # "beer reference" page but the url doesn't actually change, it just
        # seems like it's all getting done server side -- so we have to look
        # for the contents h1 to see if we're looking at the beer reference or
        # not
        if "beer reference" in soup_rows[0].find_all('td')[1].h1.contents:
            raise rb_exceptions.PageNotFound(self.url)

        if "Also known as " in soup_rows[1].find_all('td')[1].div.div.contents:
            raise rb_exceptions.AliasedBeer(self.url, soup_rows[1].find_all('td')[1].div.div.a['href'])

        # General information from the top of the page
        self.name = soup.find(itemprop='name').text.strip()
        # NOTE(review): index 0 is skipped; presumably it is a site-wide
        # "brewers" navigation link rather than this beer's brewery -- confirm.
        breweries = soup.find_all('a', href=re.compile('brewers'))
        self.brewery = Brewery(breweries[1].get('href'))
        self.brewery.name = breweries[1].text
        if len(breweries) == 3:
            self.brewed_at = Brewery(breweries[2].get('href'))
            self.brewed_at.name = breweries[2].text
        else:
            self.brewed_at = None
        try:
            self.overall_rating = int(soup.find('span', text='overall').next_sibling.next_sibling.text)
        except (ValueError, AttributeError):  # 'n/a' or element missing
            self.overall_rating = None
        try:
            self.style_rating = int(soup.find('span', text='style').previous_sibling.previous_sibling)
        except (ValueError, AttributeError):  # 'n/a' or element missing
            self.style_rating = None
        # look the style link up once and read both values from it
        style_link = soup.find(text='Style: ').next_sibling
        self.style = style_link.text
        self.style_url = style_link.get('href')
        self.img_url = soup.find(id="beerImg").get('src')
        # Data from the info bar
        self.num_ratings = int(soup.find('span', itemprop="ratingCount").text)
        try:
            self.mean_rating = float(soup.find(text='MEAN: ').next_sibling.text.split('/')[0])
        except (ValueError, AttributeError):  # empty mean ('/5.0') or no mean
            self.mean_rating = None
        try:
            self.weighted_avg = float(soup.find(attrs={"name": "real average"}).find('span', itemprop="ratingValue").text)
        except (ValueError, AttributeError):  # empty ('/5') or missing
            self.weighted_avg = None
        try:
            self.seasonal = soup.find(text=u'\xa0\xa0 SEASONAL: ').next_sibling.text
        except AttributeError:
            self.seasonal = None
        # The three numeric fields below treat "element missing"
        # (AttributeError) and "text is not a number" (ValueError) the same
        # way; the original handled only one of the two for each field.
        try:
            self.ibu = int(soup.find(title="International Bittering Units - Normally from hops").next_sibling.next_sibling.text)
        except (AttributeError, ValueError):
            self.ibu = None
        try:
            self.calories = int(soup.find(title="Estimated calories for a 12 fluid ounce serving").next_sibling.next_sibling.text)
        except (AttributeError, ValueError):
            self.calories = None
        try:
            # [:-1] drops the trailing unit character before conversion
            self.abv = float(soup.find(title="Alcohol By Volume").next_sibling.next_sibling.text[:-1])
        except (AttributeError, ValueError):  # missing, or empty ABV: '-'
            self.abv = None
        # The historical values are the strings "True"/"False" (not bools);
        # they are kept for compatibility. Unlike the original chained
        # assignment, this does not overwrite the matched tag's text.
        if soup.find(title="Currently out of production") is not None:
            self.retired = "True"
        else:
            self.retired = "False"
        # Description
        description = soup.find('div',
            style=(
                'border: 1px solid #e0e0e0; background: #fff; '
                'padding: 14px; color: #777;'
            )
        )
        if 'no commercial description' not in description.text.lower():
            # strip ads
            for small_tag in description('small'):
                small_tag.extract()
            self.description = ' '.join(description.strings).strip()
        # each tag's text is stored without its first character
        self.tags = [t.text[1:] for t in soup.find_all('span', class_="tagLink")]

        self._has_fetched = True

        return self

示例#23

0

显示文件

文件： models.py 项目： shamrt/ratebeer

    def _populate(self):
        """Fetch the page at ``self.url`` and fill this object's attributes.

        Sets ``name``, ``brewery`` / ``brewed_at`` (as ``Brewery`` objects),
        ``overall_rating``, ``style_rating``, ``style``, ``style_url``,
        ``img_url``, ``num_ratings``, ``mean_rating``, ``weighted_avg``,
        ``seasonal``, ``ibu``, ``calories``, ``abv``, ``retired`` (bool),
        ``description`` (when a commercial description exists) and ``tags``.
        Optional values that are absent or unparsable are stored as None.

        Returns:
            This object, with ``_has_fetched`` set to True.

        Raises:
            rb_exceptions.PageNotFound: if the ``rbbody`` or ``container``
                div is missing, or the page heading starts with
                "beer reference".
            rb_exceptions.AliasedBeer: if the page body contains
                "Also known as " and links to another beer.
        """
        soup = soup_helper._get_soup(self.url)
        # check for 404s
        # find() returns None rather than raising, so a missing inner
        # 'container' div has to be checked explicitly; the original
        # try/except only caught a missing outer 'rbbody' div.
        rbbody_div = soup.find('div', id='rbbody')
        rbbody = None
        if rbbody_div is not None:
            rbbody = rbbody_div.find('div', id='container')
        if rbbody is None:
            raise rb_exceptions.PageNotFound(self.url)
        # ratebeer pages don't actually 404, they just send you to this weird
        # "beer reference" page but the url doesn't actually change, it just
        # seems like it's all getting done server side -- so we have to look
        # for the contents h1 to see if we're looking at the beer reference or
        # not
        page_title = rbbody_div.h1.get_text()
        if re.match("beer reference", page_title, re.IGNORECASE):
            raise rb_exceptions.PageNotFound(self.url)

        page_body = rbbody.\
            find_all('div', class_='row')[1].\
            find('div', class_='col-sm-8')
        if re.search("Also known as ", page_body.get_text()):
            alias_uri = page_body.a['href']
            raise rb_exceptions.AliasedBeer(self.url, alias_uri)

        # General information from the top of the page
        self.name = soup.find(itemprop='name').text.strip()
        # NOTE(review): index 0 is skipped; presumably it is a site-wide
        # "brewers" navigation link rather than this beer's brewery -- confirm.
        breweries = soup.find_all('a', href=re.compile('brewers'))
        self.brewery = Brewery(breweries[1].get('href'))
        self.brewery.name = breweries[1].text
        if len(breweries) == 3:
            self.brewed_at = Brewery(breweries[2].get('href'))
            self.brewed_at.name = breweries[2].text
        else:
            self.brewed_at = None
        try:
            self.overall_rating = int(soup.find('span', text='overall').
                                      next_sibling.next_sibling.text)
        except (ValueError, AttributeError):  # 'n/a' or element missing
            self.overall_rating = None
        try:
            self.style_rating = int(soup.find('span', text='style').
                                    previous_sibling.previous_sibling)
        except (ValueError, AttributeError):  # 'n/a' or element missing
            self.style_rating = None
        # look the style link up once and read both values from it
        style_link = soup.find(text='Style: ').next_sibling
        self.style = style_link.text
        self.style_url = style_link.get('href')
        self.img_url = soup.find(id="beerImg").get('src')
        # Data from the info bar
        self.num_ratings = int(soup.find('span', itemprop="ratingCount").text)
        try:
            self.mean_rating = float(soup.find(text='MEAN: ').
                                     next_sibling.text.split('/')[0])
        except (ValueError, AttributeError):  # empty ('/5.0') or no mean
            self.mean_rating = None
        try:
            weight_avg_attrs = {"name": "real average"}
            self.weighted_avg = float(
                soup.find(attrs=weight_avg_attrs).
                find('span', itemprop="ratingValue").text
                )
        except (ValueError, AttributeError):  # empty ('/5') or missing
            self.weighted_avg = None
        try:
            self.seasonal = soup.find(
                text=u'\xa0\xa0 SEASONAL: ').next_sibling.text
        except AttributeError:
            self.seasonal = None
        # The three numeric fields below treat "element missing"
        # (AttributeError) and "text is not a number" (ValueError) the same
        # way; the original handled only one of the two for each field.
        try:
            ibu_title_text = ("International Bittering Units - "
                              "Normally from hops")
            self.ibu = int(soup.find(title=ibu_title_text).
                           next_sibling.next_sibling.text)
        except (AttributeError, ValueError):
            self.ibu = None
        try:
            calories_title_text = ("Estimated calories for a 12 fluid ounce"
                                   " serving")
            self.calories = int(soup.find(title=calories_title_text).
                                next_sibling.next_sibling.text)
        except (AttributeError, ValueError):
            self.calories = None
        try:
            # [:-1] drops the trailing unit character before conversion
            self.abv = float(soup.find(title="Alcohol By Volume").
                             next_sibling.next_sibling.text[:-1])
        except (AttributeError, ValueError):  # missing, or empty ABV: '-'
            self.abv = None
        # retired is True exactly when the page carries the
        # "Currently out of production" marker
        self.retired = bool(soup.find(title="Currently out of production"))

        # Description
        description = soup.find('span', itemprop='description')
        if not description:  # alternate object path
            description = soup.find(
                'div', class_='commercial-description-container')
        if 'no commercial description' not in description.text.lower():
            # strip ads
            for small_tag in description('small'):
                small_tag.extract()
            self.description = ' '.join(description.strings).strip()
        # each tag's text is stored without its first character
        self.tags = [t.text[1:] for t in soup.find_all(
            'span', class_="tagLink")]

        self._has_fetched = True

        return self