def get_beers(self): """Generator that provides Beer objects for the brewery's beers""" if not self._has_fetched: self._populate() _id = self.url.split('/')[-2] complete_url = u'/Ratings/Beer/ShowBrewerBeers.asp?BrewerID={0}'.format( _id) soup = soup_helper._get_soup(complete_url) soup_beer_rows = soup.find('table', id='brewer-beer-table').findAll('tr') for row in soup_beer_rows[1:]: url = row.a.get('href') # Only return rows that are ratable if not row.find('a', title="Rate this beer"): continue # Remove any whitespace characters. Rare, but possible. url = re.sub(r"\s+", "", url, flags=re.UNICODE) beer = Beer(url) beer.name = row.a.text.strip() # Add attributes from row abv = row.findAll('td')[1].text weighted_avg = row.findAll('td')[4].text.strip() style_rating = row.findAll('td')[5].text.strip() num_ratings = row.findAll('td')[6].text.strip() if abv: beer.abv = float(abv) if weighted_avg: beer.weighted_avg = float(weighted_avg) if style_rating: beer.style_rating = int(style_rating) if num_ratings: beer.num_ratings = int(num_ratings) yield beer
def _populate(self): """Returns information about a specific brewery. Args: url (string): The specific url of the beer. Looks like: "/brewers/new-belgium-brewing-company/77/" Returns: A dictionary of attributes about that brewery.""" soup = soup_helper._get_soup(self.url) s_contents = soup.find_all( 'div', {'itemtype': 'http://schema.org/LocalBusiness'}) if not s_contents: raise rb_exceptions.PageNotFound(self.url) self.name = soup.h1.text self.type = s_contents[0].find_all('div')[1].text.strip() website = s_contents[0].\ find_all('div', class_='media-links')[0].find_all('a')[0] if website: self.web = website['href'] self.telephone = Brewery._find_span(s_contents[0], 'telephone') self.street = Brewery._find_span(s_contents[0], 'streetAddress') self.city = Brewery._find_span(s_contents[0], 'addressLocality') self.state = Brewery._find_span(s_contents[0], 'addressRegion') self.country = Brewery._find_span(s_contents[0], 'addressCountry') self.postal_code = Brewery._find_span(s_contents[0], 'postalCode') self._has_fetched = True return self
def get_beers(self): """Generator that provides Beer objects for the brewery's beers""" if not self._has_fetched: self._populate() _id = self.url.split('/')[-2] complete_url = u'/Ratings/Beer/ShowBrewerBeers.asp?BrewerID={0}'.\ format(_id) soup = soup_helper._get_soup(complete_url) soup_beer_rows = soup.find( 'table', id='brewer-beer-table').findAll('tr') for row in soup_beer_rows[1:]: url = row.a.get('href') # Only return rows that are ratable if not row.find('a', title="Rate this beer"): continue # Remove any whitespace characters. Rare, but possible. url = re.sub(r"\s+", "", url, flags=re.UNICODE) beer = Beer(url) beer.name = row.a.text.strip() # Add attributes from row abv = row.findAll('td')[1].text weighted_avg = row.findAll('td')[4].text.strip() style_rating = row.findAll('td')[5].text.strip() num_ratings = row.findAll('td')[6].text.strip() if abv: beer.abv = float(abv) if weighted_avg: beer.weighted_avg = float(weighted_avg) if style_rating: beer.style_rating = int(style_rating) if num_ratings: beer.num_ratings = int(num_ratings) yield beer
def _populate(self): """Returns information about a specific brewery. Args: url (string): The specific url of the beer. Looks like: "/brewers/new-belgium-brewing-company/77/" Returns: A dictionary of attributes about that brewery.""" soup = soup_helper._get_soup(self.url) s_contents = soup.find_all( 'div', {'itemtype': 'http://schema.org/LocalBusiness'}) if not s_contents: raise rb_exceptions.PageNotFound(self.url) self.name = soup.h1.text self.type = s_contents[0].find_all('div')[1].text.strip() website = s_contents[0].find_all( 'div', {'class': 'media-links'})[0].find_all('a')[0] if website: self.web = website['href'] self.telephone = Brewery._find_span(s_contents[0], 'telephone') self.street = Brewery._find_span(s_contents[0], 'streetAddress') self.city = Brewery._find_span(s_contents[0], 'addressLocality') self.state = Brewery._find_span(s_contents[0], 'addressRegion') self.country = Brewery._find_span(s_contents[0], 'addressCountry') self.postal_code = Brewery._find_span(s_contents[0], 'postalCode') self._has_fetched = True return self
def _populate(self): """Returns information about a specific brewery. Args: url (string): The specific url of the beer. Looks like: "/brewers/new-belgium-brewing-company/77/" Returns: A dictionary of attributes about that brewery.""" soup = soup_helper._get_soup(self.url) try: s_contents = soup.find( 'div', id='container').find('table').find_all('tr')[0].find_all('td') except AttributeError: raise rb_exceptions.PageNotFound(self.url) self.name = soup.h1.text self.type = re.findall(r'Type: (.*?)<br\/>', soup.decode_contents())[0].strip() if soup.find_all(string='Web: '): self.web = soup.find_all(string='Web: ')[0].find_next()['href'] self.telephone = Brewery._find_span(s_contents[0], 'telephone') self.street = Brewery._find_span(s_contents[0], 'streetAddress') self.city = Brewery._find_span(s_contents[0], 'addressLocality') self.state = Brewery._find_span(s_contents[0], 'addressRegion') self.country = Brewery._find_span(s_contents[0], 'addressCountry') self.postal_code = Brewery._find_span(s_contents[0], 'postalCode') self._has_fetched = True return self
def beer_style(self, ident, sort_type=None, sort_order=None):
    """Get all the beers from a specific beer style page.

    Args:
        ident (integer): The ID of the beer style from beer_style_list().
            For example, for 'Abbey Dubbel' it would be 71.
        sort_type (string): The sorting of the results. The valid choices
            are "score" (default), "count", and "abv".
        sort_order (string): "ascending" (low-to-high) or "descending"
            (high-to-low, default).

    Returns:
        A generator of Beer objects.
    """
    if sort_type is None:
        sort_type = 'score'
    if sort_order is None:
        sort_order = 'descending'
    sort_type = sort_type.lower()
    sort_order = sort_order.lower()
    so = {'score': 0, 'count': 1, 'abv': 2}.get(sort_type)
    o = {'descending': 0, 'ascending': 1}.get(sort_order)

    soup = soup_helper._get_soup(
        '/ajax/top-beer.asp?s={}&so={}&o={}'.format(ident, so, o))
    rows = iter(soup.table.find_all('tr'))
    next(rows)  # Skip the header row
    for row in rows:
        data = row.find_all('td')
        link = data[1].a
        dataout = models.Beer(link.get('href'))
        dataout.name = link.text
        yield dataout
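# Hedged usage sketch for beer_style together with beer_style_list (defined
# further below). It assumes both are methods on the package's top-level
# RateBeer class; the class name is an assumption, and the Abbey Dubbel id of
# 71 comes from the docstring above:
from itertools import islice

rb = RateBeer()
styles = rb.beer_style_list()          # e.g. {'Abbey Dubbel': 71, ...}
for beer in islice(rb.beer_style(71, sort_type='abv'), 10):
    print(beer.name)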
def _populate(self): """Returns information about a specific brewery. Args: url (string): The specific url of the beer. Looks like: "/brewers/new-belgium-brewing-company/77/" Returns: A dictionary of attributes about that brewery.""" soup = soup_helper._get_soup(self.url) try: s_contents = soup.find('div', id='container').find('table').find_all('tr')[0].find_all('td') except AttributeError: raise rb_exceptions.PageNotFound(self.url) self.name = soup.h1.text self.type = re.findall(r'Type: (.*?)<br\/>', soup.decode_contents())[0].strip() if soup.find_all(string='Web: '): self.web = soup.find_all(string='Web: ')[0].find_next()['href'] self.telephone = Brewery._find_span(s_contents[0], 'telephone') self.street = Brewery._find_span(s_contents[0], 'streetAddress') self.city = Brewery._find_span(s_contents[0], 'addressLocality') self.state = Brewery._find_span(s_contents[0], 'addressRegion') self.country = Brewery._find_span(s_contents[0], 'addressCountry') self.postal_code = Brewery._find_span(s_contents[0], 'postalCode') self._has_fetched = True return self
def _populate(self): """Returns information about a specific brewery. Args: url (string): The specific url of the beer. Looks like: "/brewers/new-belgium-brewing-company/77/" Returns: A dictionary of attributes about that brewery.""" soup = soup_helper._get_soup(self.url) try: s_contents = soup.find("div", id="container").find("table").find_all("tr")[0].find_all("td") except AttributeError: raise rb_exceptions.PageNotFound(self.url) self.name = soup.h1.text self.type = re.findall(r"Type: (.*?)<br\/>", soup.decode_contents())[0].strip() if soup.find_all(string="Web: "): self.web = soup.find_all(string="Web: ")[0].find_next()["href"] self.telephone = Brewery._find_span(s_contents[0], "telephone") self.street = Brewery._find_span(s_contents[0], "streetAddress") self.city = Brewery._find_span(s_contents[0], "addressLocality") self.state = Brewery._find_span(s_contents[0], "addressRegion") self.country = Brewery._find_span(s_contents[0], "addressCountry") self.postal_code = Brewery._find_span(s_contents[0], "postalCode") self._has_fetched = True return self
def beer_style_list(self): """Returns the beer styles from the beer styles page. Returns: A dictionary, with beer styles strings for keys and integer ids for values. """ styles = {} soup = soup_helper._get_soup("/top/") for item in [i for i in soup.find('select', id="StyleMenu").find_all('option') if i.get('name')]: styles[item.text.strip()] = int(item.get('value')) return styles
def beer_style_list(self): """Returns the beer styles from the beer styles page. Returns: A dictionary, with beer styles for keys and urls for values. """ styles = {} soup = soup_helper._get_soup("/beerstyles/") columns = soup.find_all('table')[2].find_all('td') for column in columns: lines = [li for li in column.find_all('li')] for line in lines: styles[line.text] = line.a.get('href') return styles
def get_reviews(self, review_order="most recent"):
    """Returns reviews for this beer (self.url), which looks like
    "/beer/deschutes-inversion-ipa/55610/".

    Args:
        review_order (string): How to sort reviews. Three inputs:
            most recent: Newer reviews appear earlier.
            top raters: RateBeer.com top raters appear earlier.
            highest score: Reviews with the highest overall score appear
                earlier.

    Returns:
        A generator of Review objects.
    """
    if not self._has_fetched:
        self._populate()

    review_order = review_order.lower()
    url_codes = {
        "most recent": 1,
        "top raters": 2,
        "highest score": 3
    }
    url_flag = url_codes.get(review_order)
    if not url_flag:
        raise ValueError("Invalid ``review_order``.")

    page_number = 1
    while True:
        complete_url = u'{0}{1}/{2}/'.format(self.url, url_flag, page_number)
        soup = soup_helper._get_soup(complete_url)
        reviews_container = soup.find('div', class_='reviews-container')
        reviews = reviews_container.find_all(
            'div', style='padding: 0px 0px 0px 0px;')
        if len(reviews) < 1:
            # Raising StopIteration inside a generator is an error under
            # PEP 479 (Python 3.7+); returning ends the generator cleanly.
            return
        for review_soup in reviews:
            yield Review(review_soup)
        page_number += 1
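# Hedged usage sketch: pull the first few reviews without walking every page.
# The Beer class is from this package; the example url is the one given in
# the docstring above:
from itertools import islice

beer = Beer('/beer/deschutes-inversion-ipa/55610/')
for review in islice(beer.get_reviews(review_order="top raters"), 5):
    print(review)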
def get_beers(self): """Generator that provides Beer objects for the brewery's beers""" if not self._has_fetched: self._populate() page_number = 1 while True: complete_url = u'{0}0/{1}/'.format(self.url, page_number) soup = soup_helper._get_soup(complete_url) soup_beer_rows = soup.find('table', 'maintable nohover').findAll('tr') if len(soup_beer_rows) < 2: raise StopIteration for row in soup_beer_rows[1:]: url = row.a.get('href') # Only return rows that are ratable if not row.find(class_='rate'): continue # Remove any whitespace characters. Rare, but possible. url = re.sub(r"\s+", "", url, flags=re.UNICODE) beer = Beer(url) beer.name = row.a.text.strip() # Add attributes from row abv = row.findAll('td')[2].text weighted_avg = row.findAll('td')[3].text.strip() overall_rating = row.findAll('td')[4].text.strip() style_rating = row.findAll('td')[5].text.strip() num_ratings = row.findAll('td')[6].text.strip() if abv: beer.abv = float(abv) if weighted_avg: beer.weighted_avg = float(weighted_avg) if overall_rating: beer.overall_rating = int(overall_rating) if style_rating: beer.style_rating = int(style_rating) if num_ratings: beer.num_ratings = int(num_ratings) yield beer page_number += 1
def get_beers(self): """Generator that provides Beer objects for the brewery's beers""" if not self._has_fetched: self._populate() page_number = 1 while True: complete_url = u"{0}0/{1}/".format(self.url, page_number) soup = soup_helper._get_soup(complete_url) soup_beer_rows = soup.find("table", "maintable nohover").findAll("tr") if len(soup_beer_rows) < 2: raise StopIteration for row in soup_beer_rows[1:]: url = row.a.get("href") # Only return rows that are ratable if not row.find(class_="rate"): continue # Remove any whitespace characters. Rare, but possible. url = re.sub(r"\s+", "", url, flags=re.UNICODE) beer = Beer(url) beer.name = row.a.text.strip() # Add attributes from row abv = row.findAll("td")[2].text weighted_avg = row.findAll("td")[3].text.strip() overall_rating = row.findAll("td")[4].text.strip() style_rating = row.findAll("td")[5].text.strip() num_ratings = row.findAll("td")[6].text.strip() if abv: beer.abv = float(abv) if weighted_avg: beer.weighted_avg = float(weighted_avg) if overall_rating: beer.overall_rating = int(overall_rating) if style_rating: beer.style_rating = int(style_rating) if num_ratings: beer.num_ratings = int(num_ratings) yield beer page_number += 1
def get_beers(self):
    """Generator that provides Beer objects for the brewery's beers."""
    page_number = 1
    while True:
        complete_url = u'{0}0/{1}/'.format(self.url, page_number)
        soup = soup_helper._get_soup(complete_url)
        soup_beer_rows = soup.find('table', 'maintable nohover').findAll('tr')
        if len(soup_beer_rows) < 2:
            # Raising StopIteration inside a generator is an error under
            # PEP 479 (Python 3.7+); returning ends the generator cleanly.
            return

        for row in soup_beer_rows[1:]:
            url = row.a.get('href')
            # Only return rows that are ratable
            if not row.find(class_='rate'):
                continue
            # Sometimes the beer is listed but it doesn't have a page;
            # ignore it for now.
            try:
                beer = Beer(url)
            except rb_exceptions.PageNotFound:
                continue
            yield beer
        page_number += 1
def _populate(self):
    soup = soup_helper._get_soup(self.url)

    # check for 404s
    try:
        soup_rows = soup.find('div', id='container').find('table').find_all('tr')
    except AttributeError:
        raise rb_exceptions.PageNotFound(self.url)
    # ratebeer pages don't actually 404, they just send you to this weird
    # "beer reference" page but the url doesn't actually change, it just
    # seems like it's all getting done server side -- so we have to look
    # for the contents h1 to see if we're looking at the beer reference or
    # not
    # if "beer reference" in soup_rows[0].find_all('td')[1].h1.contents:
    #     raise rb_exceptions.PageNotFound(self.url)
    # if "Also known as " in soup_rows[1].find_all('td')[1].div.div.contents:
    #     raise rb_exceptions.AliasedBeer(
    #         self.url, soup_rows[1].find_all('td')[1].div.div.a['href'])
    if soup_rows is None:
        raise rb_exceptions.PageNotFound(self.url)

    # General information from the top of the page
    self.name = soup.find(itemprop='name').text.strip()
    breweries = soup.find_all('a', href=re.compile('brewers'))
    self.brewery = Brewery(breweries[1].get('href'))
    self.brewery.name = breweries[1].text
    if len(breweries) == 3:
        self.brewed_at = Brewery(breweries[2].get('href'))
        self.brewed_at.name = breweries[2].text
    else:
        self.brewed_at = None
    try:
        self.overall_rating = int(
            soup.find('span', text='overall').next_sibling.next_sibling.text)
    except ValueError:  # 'n/a'
        self.overall_rating = None
    except AttributeError:
        self.overall_rating = None
    try:
        self.style_rating = int(
            soup.find('span', text='style').previous_sibling.previous_sibling)
    except ValueError:  # 'n/a'
        self.style_rating = None
    except AttributeError:
        self.style_rating = None
    self.style = soup.find(text='Style: ').next_sibling.text
    self.style_url = soup.find(text='Style: ').next_sibling.get('href')
    self.img_url = soup.find(id="beerImg").get('src')

    # Data from the info bar
    self.num_ratings = int(soup.find('span', itemprop="ratingCount").text)
    try:
        self.mean_rating = float(
            soup.find(text='MEAN: ').next_sibling.text.split('/')[0])
    except ValueError:  # Empty mean rating: '/5.0'
        self.mean_rating = None
    except AttributeError:  # No mean rating
        self.mean_rating = None
    try:
        self.weighted_avg = float(
            soup.find(attrs={"name": "real average"})
                .find('span', itemprop="ratingValue").text)
    except ValueError:  # Empty weighted average rating: '/5'
        self.weighted_avg = None
    except AttributeError:  # No weighted average rating
        self.weighted_avg = None
    try:
        self.seasonal = soup.find(
            text=u'\xa0\xa0 SEASONAL: ').next_sibling.text
    except AttributeError:
        self.seasonal = None
    try:
        self.ibu = int(
            soup.find(
                title="International Bittering Units - Normally from hops"
            ).next_sibling.next_sibling.text)
    except AttributeError:
        self.ibu = None
    try:
        self.calories = int(
            soup.find(
                title="Estimated calories for a 12 fluid ounce serving"
            ).next_sibling.next_sibling.text)
    except AttributeError:
        self.calories = None
    try:
        self.abv = float(
            soup.find(title="Alcohol By Volume").next_sibling.next_sibling.text[:-1])
    except ValueError:  # Empty ABV: '-'
        self.abv = None
    if soup.find(title="Currently out of production"):
        self.retired = True
    else:
        self.retired = False

    # Description
    description = soup.find('div', 'commercial-description-container')
    if 'no commercial description' not in description.text.lower():
        # strip ads
        [s.extract() for s in description('small')]
        self.description = ' '.join([s for s in description.strings]).strip()

    self.tags = [t.text[1:] for t in soup.find_all('span', class_="tagLink")]

    self._has_fetched = True

    return self
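# Hedged usage sketch: elsewhere in the package _populate is presumably
# triggered lazily (that mechanism is assumed here, not shown), but calling it
# directly also fills in the attributes. The example url comes from the
# get_reviews docstring above:
beer = Beer('/beer/deschutes-inversion-ipa/55610/')
beer._populate()
print(beer.name, beer.style, beer.abv, beer.overall_rating)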
def __init__(self, url):
    soup = soup_helper._get_soup(url)

    # check for 404s
    try:
        soup_rows = soup.find('div', id='container').find('table').find_all('tr')
    except AttributeError:
        raise rb_exceptions.PageNotFound(url)
    # ratebeer pages don't actually 404, they just send you to this weird
    # "beer reference" page but the url doesn't actually change, it just
    # seems like it's all getting done server side -- so we have to look
    # for the contents h1 to see if we're looking at the beer reference or
    # not
    if "beer reference" in soup_rows[0].find_all('td')[1].h1.contents:
        raise rb_exceptions.PageNotFound(url)
    if "Also known as " in soup_rows[1].find_all('td')[1].div.div.contents:
        raise rb_exceptions.AliasedBeer(
            url, soup_rows[1].find_all('td')[1].div.div.a['href'])

    # get beer meta information
    # grab the html and split it into a keyword and value
    brew_info_html = soup_rows[1].find_all('td')[1].div.small
    brew_info = [s.split(': ') for s in brew_info_html.text.split(u'\xa0\xa0')]
    keyword_lookup = {
        "RATINGS": "num_ratings",
        "MEAN": "mean_rating",
        "WEIGHTED AVG": "weighted_avg",
        "SEASONAL": "seasonal",
        "CALORIES": "calories",
        "EST. CALORIES": "calories",
        "ABV": "abv",
        "IBU": "ibu",
    }

    # match the data pulled from the brew info to the keyword
    # in the lookup table
    for meta_name, meta_data in brew_info:
        match = keyword_lookup.get(meta_name.strip())
        if match == "mean_rating":
            # Strip the "/5.0" denominator so the value converts to float.
            meta_data = meta_data[:meta_data.find("/")]
        elif match == "abv":
            meta_data = meta_data[:-1]
        elif not match:
            continue
        # convert to float if possible
        try:
            if match == "num_ratings":
                meta_data = int(meta_data)
            else:
                meta_data = float(meta_data)
        except ValueError:
            pass
        setattr(self, match, meta_data)

    info = soup_rows[1].tr.find_all('td')

    # get basic brewery information
    brewery_info = info[1].find('div').contents
    brewery = brewery_info[0].findAll('a')[0]
    brewed_at = None
    if 'brewed at' in brewery_info[0].text.lower():
        brewed_at = brewery_info[0].findAll('a')[0]
    if brewery:
        self.brewery = brewery.text.strip()
        self.brewery_url = brewery.get('href')
    if brewed_at:
        self.brewed_at = brewed_at.text.strip()
        self.brewed_at_url = brewed_at.get('href')

    # get ratings
    ratings = info[0].findAll('div')
    if len(ratings) > 1:
        overall_rating = ratings[1].findAll('span')
        style_rating = ratings[3].findAll('span')
    else:
        overall_rating = None
        style_rating = None
    if overall_rating and overall_rating[1].text != 'n/a':
        self.overall_rating = int(overall_rating[1].text)
    if style_rating and style_rating[0].text != 'n/a':
        self.style_rating = int(style_rating[0].text)

    # get the beer style
    if brewery_info[3]:
        self.style = brewery_info[3].text.strip()

    # get the beer country
    if ',' in brewery_info[5]:
        # Non-USA addresses
        self.brewery_country = brewery_info[5].split(',')[1].strip()
    else:
        # USA addresses
        self.brewery_country = brewery_info[8].strip()

    # get the beer description
    description = soup_rows[1].find_all('td')[1].find(
        'div',
        style=('border: 1px solid #e0e0e0; background: #fff; '
               'padding: 14px; color: #777;'))
    if 'no commercial description' not in description.text.lower():
        # strip ads
        [s.extract() for s in description('small')]
        self.description = ' '.join([s for s in description.strings]).strip()

    # get url
    self.url = soup.find('link', rel='canonical')['href'].replace(
        soup_helper._BASE_URL, '')

    # get name
    self.name = soup_rows[0].find_all('td')[1].h1.text.strip()
def _populate(self):
    soup = soup_helper._get_soup(self.url)

    # check for 404s
    try:
        soup_rows = soup.find("div", id="container").find("table").find_all("tr")
    except AttributeError:
        raise rb_exceptions.PageNotFound(self.url)
    # ratebeer pages don't actually 404, they just send you to this weird
    # "beer reference" page but the url doesn't actually change, it just
    # seems like it's all getting done server side -- so we have to look
    # for the contents h1 to see if we're looking at the beer reference or
    # not
    if "beer reference" in soup_rows[0].find_all("td")[1].h1.contents:
        raise rb_exceptions.PageNotFound(self.url)
    if "Also known as " in soup_rows[1].find_all("td")[1].div.div.contents:
        raise rb_exceptions.AliasedBeer(
            self.url, soup_rows[1].find_all("td")[1].div.div.a["href"])

    # get beer meta information
    # grab the html and split it into a keyword and value
    brew_info_divs = soup_rows[1].find_all("table")[0].find_all("div")
    brew_info_html = soup_rows[1].find_all("td")[1].div.small
    brew_info = [s.split(": ") for s in brew_info_html.text.split(u"\xa0\xa0")]

    # get ratings
    if "No Score" in brew_info_divs[0].text:
        self.overall_rating = None
        self.style_rating = None
        self.style = brew_info_divs[2].find_all("a")[1].text
    else:
        # The rating digits sit at the end (overall) or start (style) of the
        # div text, so count the digits to slice them out.
        overalltxt = brew_info_divs[0].text
        self.overall_rating = int(
            overalltxt[-sum(c.isdigit() for c in overalltxt):])
        styleratingtxt = brew_info_divs[3].text
        self.style_rating = int(
            styleratingtxt[:sum(c.isdigit() for c in styleratingtxt)])
        self.style = brew_info_divs[4].find_all("a")[1].text

    keyword_lookup = {
        "RATINGS": "num_ratings",
        "MEAN": "mean_rating",
        "WEIGHTED AVG": "weighted_avg",
        "SEASONAL": "seasonal",
        "CALORIES": "calories",
        "EST. CALORIES": "calories",
        "ABV": "abv",
        "IBU": "ibu",
    }

    # match the data pulled from the brew info to the keyword
    # in the lookup table
    for meta_name, meta_data in brew_info:
        match = keyword_lookup.get(meta_name.strip())
        if match == "mean_rating":
            # Strip the "/5.0" denominator so the value converts to float.
            meta_data = meta_data[: meta_data.find("/")]
        elif match == "abv":
            meta_data = meta_data[:-1]
        elif not match:
            continue
        # convert to float if possible
        try:
            if match == "num_ratings":
                meta_data = int(meta_data)
            else:
                meta_data = float(meta_data)
        except ValueError:
            pass
        setattr(self, match, meta_data)

    # populate with image url also
    self.image = soup_rows[1].img["src"]

    info = soup_rows[1].tr.find_all("td")

    # get basic brewery information
    brewery_info = info[1].find("div").contents
    brewery_urls = brewery_info[0].findAll("a")
    brewery = brewery_urls[0]
    brewed_at = None
    if len(brewery_urls) == 2:
        brewed_at = brewery_urls[1]
    if brewery:
        self.brewery = brewery.text.strip()
        self.brewery_url = brewery.get("href")
    if brewed_at:
        self.brewed_at = brewed_at.text.strip()
        self.brewed_at_url = brewed_at.get("href")

    # get the beer style
    if brewery_info[3]:
        self.style = brewery_info[3].text.strip()

    # get the beer country
    if "," in brewery_info[5]:
        # Non-USA addresses
        self.brewery_country = brewery_info[5].split(",")[1].strip()
    else:
        # USA addresses
        self.brewery_country = brewery_info[8].strip()

    # get the beer description
    description = soup_rows[1].find_all("td")[1].find(
        "div",
        style=("border: 1px solid #e0e0e0; background: #fff; "
               "padding: 14px; color: #777;"))
    if "no commercial description" not in description.text.lower():
        # strip ads
        [s.extract() for s in description("small")]
        self.description = " ".join([s for s in description.strings]).strip()

    self.name = soup.h1.text

    self._has_fetched = True

    return self
def _populate(self):
    soup = soup_helper._get_soup(self.url)

    # check for 404s
    try:
        soup_rows = soup.find('div', id='container').find('table').find_all('tr')
    except AttributeError:
        raise rb_exceptions.PageNotFound(self.url)
    # ratebeer pages don't actually 404, they just send you to this weird
    # "beer reference" page but the url doesn't actually change, it just
    # seems like it's all getting done server side -- so we have to look
    # for the contents h1 to see if we're looking at the beer reference or
    # not
    if "beer reference" in soup_rows[0].find_all('td')[1].h1.contents:
        raise rb_exceptions.PageNotFound(self.url)
    if "Also known as " in soup_rows[1].find_all('td')[1].div.div.contents:
        raise rb_exceptions.AliasedBeer(
            self.url, soup_rows[1].find_all('td')[1].div.div.a['href'])

    # get beer meta information
    # grab the html and split it into a keyword and value
    brew_info_html = soup_rows[1].find_all('td')[1].div.small
    brew_info = [
        s.split(': ') for s in brew_info_html.text.split(u'\xa0\xa0')
    ]
    keyword_lookup = {
        "RATINGS": "num_ratings",
        "MEAN": "mean_rating",
        "WEIGHTED AVG": "weighted_avg",
        "SEASONAL": "seasonal",
        "CALORIES": "calories",
        "EST. CALORIES": "calories",
        "ABV": "abv",
        "IBU": "ibu",
    }

    # match the data pulled from the brew info to the keyword
    # in the lookup table
    for meta_name, meta_data in brew_info:
        match = keyword_lookup.get(meta_name.strip())
        if match == "mean_rating":
            # Strip the "/5.0" denominator so the value converts to float.
            meta_data = meta_data[:meta_data.find("/")]
        elif match == "abv":
            meta_data = meta_data[:-1]
        elif not match:
            continue
        # convert to float if possible
        try:
            if match == "num_ratings":
                meta_data = int(meta_data)
            else:
                meta_data = float(meta_data)
        except ValueError:
            pass
        setattr(self, match, meta_data)

    info = soup_rows[1].tr.find_all('td')

    # get basic brewery information
    brewery_info = info[1].find('div').contents
    brewery_urls = brewery_info[0].findAll('a')
    brewery = brewery_urls[0]
    brewed_at = None
    if len(brewery_urls) == 2:
        brewed_at = brewery_urls[1]
    if brewery:
        self.brewery = brewery.text.strip()
        self.brewery_url = brewery.get('href')
    if brewed_at:
        self.brewed_at = brewed_at.text.strip()
        self.brewed_at_url = brewed_at.get('href')

    # get ratings
    ratings = info[0].findAll('div')
    if len(ratings) > 3:
        self.overall_rating = ratings[1].contents[2]
        self.style_rating = ratings[3].contents[0]

    # get the beer style
    if brewery_info[3]:
        self.style = brewery_info[3].text.strip()

    # get the beer country
    if ',' in brewery_info[5]:
        # Non-USA addresses
        self.brewery_country = brewery_info[5].split(',')[1].strip()
    else:
        # USA addresses
        self.brewery_country = brewery_info[8].strip()

    # get the beer description
    description = soup_rows[1].find_all('td')[1].find(
        'div',
        style=('border: 1px solid #e0e0e0; background: #fff; '
               'padding: 14px; color: #777;'))
    if 'no commercial description' not in description.text.lower():
        # strip ads
        [s.extract() for s in description('small')]
        self.description = ' '.join([s for s in description.strings]).strip()

    # get name
    self.name = soup_rows[0].find_all('td')[1].h1.text.strip()

    self._has_fetched = True

    return self
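# Worked example of the brew-info splitting used above, with a made-up input
# string (the real page text joins the fields with non-breaking spaces,
# u'\xa0\xa0'):
sample = (u'RATINGS: 1842\xa0\xa0MEAN: 3.55/5.0\xa0\xa0WEIGHTED AVG: 3.51/5'
          u'\xa0\xa0IBU: 60\xa0\xa0EST. CALORIES: 195\xa0\xa0ABV: 6.5%')
pairs = [s.split(': ') for s in sample.split(u'\xa0\xa0')]
# pairs == [['RATINGS', '1842'], ['MEAN', '3.55/5.0'], ['WEIGHTED AVG', '3.51/5'],
#           ['IBU', '60'], ['EST. CALORIES', '195'], ['ABV', '6.5%']]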
def _populate(self):
    soup = soup_helper._get_soup(self.url)

    # check for 404s
    try:
        soup_rows = soup.find('div', id='container').find('table').find_all('tr')
    except AttributeError:
        raise rb_exceptions.PageNotFound(self.url)
    # ratebeer pages don't actually 404, they just send you to this weird
    # "beer reference" page but the url doesn't actually change, it just
    # seems like it's all getting done server side -- so we have to look
    # for the contents h1 to see if we're looking at the beer reference or
    # not
    if "beer reference" in soup_rows[0].find_all('td')[1].h1.contents:
        raise rb_exceptions.PageNotFound(self.url)
    if "Also known as " in soup_rows[1].find_all('td')[1].div.div.contents:
        raise rb_exceptions.AliasedBeer(
            self.url, soup_rows[1].find_all('td')[1].div.div.a['href'])

    # General information from the top of the page
    self.name = soup.find(itemprop='name').text.strip()
    breweries = soup.find_all('a', href=re.compile('brewers'))
    self.brewery = Brewery(breweries[1].get('href'))
    self.brewery.name = breweries[1].text
    if len(breweries) == 3:
        self.brewed_at = Brewery(breweries[2].get('href'))
        self.brewed_at.name = breweries[2].text
    else:
        self.brewed_at = None
    try:
        self.overall_rating = int(
            soup.find('span', text='overall').next_sibling.next_sibling.text)
    except ValueError:  # 'n/a'
        self.overall_rating = None
    except AttributeError:
        self.overall_rating = None
    try:
        self.style_rating = int(
            soup.find('span', text='style').previous_sibling.previous_sibling)
    except ValueError:  # 'n/a'
        self.style_rating = None
    except AttributeError:
        self.style_rating = None
    self.style = soup.find(text='Style: ').next_sibling.text
    self.style_url = soup.find(text='Style: ').next_sibling.get('href')
    self.img_url = soup.find(id="beerImg").get('src')

    # Data from the info bar
    self.num_ratings = int(soup.find('span', itemprop="ratingCount").text)
    try:
        self.mean_rating = float(
            soup.find(text='MEAN: ').next_sibling.text.split('/')[0])
    except ValueError:  # Empty mean rating: '/5.0'
        self.mean_rating = None
    except AttributeError:  # No mean rating
        self.mean_rating = None
    try:
        self.weighted_avg = float(
            soup.find(attrs={"name": "real average"})
                .find('span', itemprop="ratingValue").text)
    except ValueError:  # Empty weighted average rating: '/5'
        self.weighted_avg = None
    except AttributeError:  # No weighted average rating
        self.weighted_avg = None
    try:
        self.seasonal = soup.find(
            text=u'\xa0\xa0 SEASONAL: ').next_sibling.text
    except AttributeError:
        self.seasonal = None
    try:
        self.ibu = int(
            soup.find(
                title="International Bittering Units - Normally from hops"
            ).next_sibling.next_sibling.text)
    except AttributeError:
        self.ibu = None
    try:
        self.calories = int(
            soup.find(
                title="Estimated calories for a 12 fluid ounce serving"
            ).next_sibling.next_sibling.text)
    except AttributeError:
        self.calories = None
    try:
        self.abv = float(
            soup.find(title="Alcohol By Volume").next_sibling.next_sibling.text[:-1])
    except ValueError:  # Empty ABV: '-'
        self.abv = None
    if soup.find(title="Currently out of production"):
        self.retired = True
    else:
        self.retired = False

    # Description
    description = soup.find(
        'div',
        style=('border: 1px solid #e0e0e0; background: #fff; '
               'padding: 14px; color: #777;'))
    if 'no commercial description' not in description.text.lower():
        # strip ads
        [s.extract() for s in description('small')]
        self.description = ' '.join([s for s in description.strings]).strip()

    self.tags = [t.text[1:] for t in soup.find_all('span', class_="tagLink")]

    self._has_fetched = True

    return self
def _populate(self):
    soup = soup_helper._get_soup(self.url)

    # check for 404s
    try:
        rbbody = soup.find('div', id='rbbody').find('div', id='container')
    except AttributeError:
        raise rb_exceptions.PageNotFound(self.url)
    # ratebeer pages don't actually 404, they just send you to this weird
    # "beer reference" page but the url doesn't actually change, it just
    # seems like it's all getting done server side -- so we have to look
    # for the contents h1 to see if we're looking at the beer reference or
    # not
    page_title = soup.find('div', id='rbbody').h1.get_text()
    if re.match("beer reference", page_title, re.IGNORECASE):
        raise rb_exceptions.PageNotFound(self.url)
    page_body = rbbody.find_all('div', class_='row')[1].find(
        'div', class_='col-sm-8')
    if re.search("Also known as ", page_body.get_text()):
        alias_uri = page_body.a['href']
        raise rb_exceptions.AliasedBeer(self.url, alias_uri)

    # General information from the top of the page
    self.name = soup.find(itemprop='name').text.strip()
    breweries = soup.find_all('a', href=re.compile('brewers'))
    self.brewery = Brewery(breweries[1].get('href'))
    self.brewery.name = breweries[1].text
    if len(breweries) == 3:
        self.brewed_at = Brewery(breweries[2].get('href'))
        self.brewed_at.name = breweries[2].text
    else:
        self.brewed_at = None
    try:
        self.overall_rating = int(
            soup.find('span', text='overall').next_sibling.next_sibling.text)
    except ValueError:  # 'n/a'
        self.overall_rating = None
    except AttributeError:
        self.overall_rating = None
    try:
        self.style_rating = int(
            soup.find('span', text='style').previous_sibling.previous_sibling)
    except ValueError:  # 'n/a'
        self.style_rating = None
    except AttributeError:
        self.style_rating = None
    self.style = soup.find(text='Style: ').next_sibling.text
    self.style_url = soup.find(text='Style: ').next_sibling.get('href')
    self.img_url = soup.find(id="beerImg").get('src')

    # Data from the info bar
    self.num_ratings = int(soup.find('span', itemprop="ratingCount").text)
    try:
        self.mean_rating = float(
            soup.find(text='MEAN: ').next_sibling.text.split('/')[0])
    except ValueError:  # Empty mean rating: '/5.0'
        self.mean_rating = None
    except AttributeError:  # No mean rating
        self.mean_rating = None
    try:
        weight_avg_attrs = {"name": "real average"}
        self.weighted_avg = float(
            soup.find(attrs=weight_avg_attrs)
                .find('span', itemprop="ratingValue").text)
    except ValueError:  # Empty weighted average rating: '/5'
        self.weighted_avg = None
    except AttributeError:  # No weighted average rating
        self.weighted_avg = None
    try:
        self.seasonal = soup.find(
            text=u'\xa0\xa0 SEASONAL: ').next_sibling.text
    except AttributeError:
        self.seasonal = None
    try:
        ibu_title_text = ("International Bittering Units - "
                          "Normally from hops")
        self.ibu = int(
            soup.find(title=ibu_title_text).next_sibling.next_sibling.text)
    except AttributeError:
        self.ibu = None
    try:
        calories_title_text = ("Estimated calories for a 12 fluid ounce"
                               " serving")
        self.calories = int(
            soup.find(title=calories_title_text).next_sibling.next_sibling.text)
    except AttributeError:
        self.calories = None
    try:
        self.abv = float(
            soup.find(title="Alcohol By Volume").next_sibling.next_sibling.text[:-1])
    except ValueError:  # Empty ABV: '-'
        self.abv = None
    if soup.find(title="Currently out of production"):
        self.retired = True
    else:
        self.retired = False

    # Description
    description = soup.find('span', itemprop='description')
    if not description:
        # alternate object path
        description = soup.find('div', class_='commercial-description-container')
    if 'no commercial description' not in description.text.lower():
        # strip ads
        [s.extract() for s in description('small')]
        self.description = ' '.join([s for s in description.strings]).strip()

    self.tags = [t.text[1:] for t in soup.find_all('span', class_="tagLink")]

    self._has_fetched = True

    return self
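# The try/except-to-None pattern repeated above could be factored into a small
# helper. A sketch only; the helper name is illustrative and not part of the
# library:
def _scrape_or_none(parse, default=None):
    # Run a zero-argument parse callable, swallowing the errors the scraping
    # code above already tolerates.
    try:
        return parse()
    except (AttributeError, ValueError, TypeError, IndexError):
        return default

# e.g. self.ibu = _scrape_or_none(
#     lambda: int(soup.find(title=ibu_title_text).next_sibling.next_sibling.text))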