def scrape_bluebottle():
    """Scrape Blue Bottle's store listing and record every coffee found.

    Downloads the store listing page, walks every ``h2`` heading with the
    classes ``f5 lh-title man``, skips headings whose text contains one of
    the ignored words, and for the rest downloads the product page, pulls
    out the product fields plus the raw image bytes, and passes them to
    ``add_or_update_coffee``. The new/updated counts are logged at the end,
    together with any entries ``add_or_update_coffee`` put in the error list.

    Returns nothing; results are handed to ``add_or_update_coffee`` and the
    logger.

    NOTE(review): SOURCE contains a second ``scrape_bluebottle`` further
    down; if both live in the same module the later definition replaces
    this one - confirm which is intended to be live.
    """
    roaster = 'Blue Bottle'
    listing_url = 'https://bluebottlecoffee.com/store/coffee'
    site_root = 'https://bluebottlecoffee.com'

    listing_page = requests.get(listing_url)
    listing_soup = BeautifulSoup(listing_page.content, "html.parser")
    headings = listing_soup.find_all('h2', {'class': 'f5 lh-title man'})

    total_coffees = len(headings)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []
    skip_words = ('Box', 'Kit', 'Subscriptions', 'at Home')

    for heading in headings:
        name = heading.string
        # Headings matching a skip word are not counted in the totals.
        if any(word in name for word in skip_words):
            total_coffees -= 1
            continue

        url = heading.a['href']
        product_url = site_root + url
        logging.info("Getting url: {}".format(url))
        product_page = requests.get(product_url)
        coffee_soup = BeautifulSoup(product_page.content, "html.parser")

        price_tag = coffee_soup.find('span', {'class': 'js-variant-price'})
        price = float(price_tag.string)
        overview_tag = coffee_soup.find('p', {'class': 'spec-overview'})
        description = overview_tag.string
        # Notes are read from the first <p> on the page, comma separated.
        notes = coffee_soup.p.string.lower().split(',')

        # Start from the region derived from the product name; the
        # spec-details paragraph overrides it when its first child is text
        # that country_from_name maps to a non-empty value.
        region = country_from_name(name)
        try:
            details_tag = coffee_soup.find('p', {'class': 'spec-details'})
            details = details_tag.contents[0].strip()
            if country_from_name(details) != '':
                region = details
        except AttributeError:
            # No usable details paragraph: an espresso is allowed to have
            # no region at all, anything else keeps the name-based region.
            if 'Espresso' in name:
                region = ""

        size_select = coffee_soup.find('select', {'id': 'cart_item_model_id'})
        # Keep only the text in front of the word 'Bag' in the first option.
        size = size_select.option.string.split('Bag')[0]

        image_url = coffee_soup.img['src']
        image_content = requests.get(image_url).content

        coffee_data = {
            'name': name,
            'roaster': roaster,
            'description': description,
            'price': price,
            'notes': notes,
            'region': region,
            'active': True,
            'product_page': product_url,
            'size': size,
            'image': image_content,
        }
        coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(
            coffee_data, coffees_updated, coffees_entered, error_coffees)

    logging.info('Blue Bottle New Results:{} / {}'.format(
        coffees_entered, total_coffees))
    logging.info('Blue Bottle Updated Results:{} / {}'.format(
        coffees_updated, total_coffees))
    if error_coffees:
        logging.warning(
            'Blue Bottle Error coffees are: {}'.format(error_coffees))
def scrape_stumptown():
    """Scrape Stumptown's coffee listing and record every coffee found.

    Downloads the listing page, follows every ``a`` element with the
    classes ``product-grid _link`` whose href does not contain ``trio``,
    parses the product page and passes the collected fields plus the raw
    image bytes to ``add_or_update_coffee``. New/updated counts and any
    error entries returned by ``add_or_update_coffee`` are logged at the
    end.

    Price, notes and size are best-effort: when they cannot be read the
    field is left as an empty string and (for price and size) a warning is
    logged. Any other missing element raises and aborts the scrape.

    Returns nothing.

    NOTE(review): SOURCE contains a second ``scrape_stumptown`` further
    down; if both live in the same module the later definition replaces
    this one - confirm which is intended to be live.
    """
    roaster = 'Stumptown'
    stumptown = 'https://www.stumptowncoffee.com/coffee'
    r = requests.get(stumptown)
    soup = BeautifulSoup(r.content, "html.parser")
    # Listing entries are anchors with class="product-grid _link".
    coffees_for_sale = soup.find_all('a', {'class': 'product-grid _link'})

    # Running totals reported in the summary log lines.
    total_coffees = len(coffees_for_sale)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []

    for items in coffees_for_sale:
        url = items['href']
        if 'trio' in url:
            # Links containing 'trio' are skipped and not counted.
            total_coffees -= 1
            continue

        # Fallback values for the fields that are allowed to be missing.
        price = ""
        notes = ""
        size = ""

        product_url = 'https://www.stumptowncoffee.com' + url
        logging.info("Getting url: {}".format(url))
        r = requests.get(product_url)
        coffee_soup = BeautifulSoup(r.content, "html.parser")

        # The product name is the first <h1> on the page.
        name = coffee_soup.h1.string.strip()

        try:
            price = float(
                coffee_soup.find_all(
                    'span', {'class': 'js-pdp-price'})[0].string)
        except IndexError as e:
            # No price element on the page; keep the empty fallback.
            logging.warning(
                "Error while getting price for {} : {}".format(name, e))

        description = coffee_soup.find(
            'div', {'class': 'product _description'}).p.string

        try:
            # Notes come from the first <h3>; '&' is treated like a comma.
            notes = coffee_soup.h3.string.replace('&', ',').lower().split(',')
        except AttributeError:
            # No notes found; keep the empty fallback.
            pass

        region = country_from_name(name)

        # An <h6> on the product page marks the coffee as sold out.
        active = coffee_soup.h6 is None

        try:
            # Size in ounces: first run of digits in the second <p> of the
            # specs block.
            specs = coffee_soup.find('div', {'class': 'product _specs'})
            size = '{} oz'.format(
                re.findall(r'\d+', specs.find_all('p')[1].string)[0])
        except Exception as e:
            # Deliberately broad: size is optional and the page layout
            # varies, so any failure here only costs the size field.
            logging.warning(
                "Error while getting size for {} : {}".format(name, e))

        image_url = coffee_soup.select(
            'div.product._image')[0].find('span')['data-src']
        image_content = requests.get(image_url).content

        coffee_data = {
            'name': name,
            'roaster': roaster,
            'description': description,
            'price': price,
            'notes': notes,
            'region': region,
            'active': active,
            'product_page': product_url,
            'size': size,
            'image': image_content,
        }
        coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(
            coffee_data, coffees_updated, coffees_entered, error_coffees)

    logging.info('Stumptown New Results:{} / {}'.format(
        coffees_entered, total_coffees))
    logging.info('Stumptown Updated Results:{} / {}'.format(
        coffees_updated, total_coffees))
    if error_coffees:
        logging.warning(
            'Stumptown Error coffees are: {}'.format(error_coffees))
def scrape_heart():
    """Scrape Heart's bean collection and record every coffee found.

    Downloads the collection page, collects the product link of every
    ``a.grid__image`` element whose image alt text does not contain
    ``Subscription``, then parses each product page and passes the fields
    plus the raw image bytes to ``add_or_update_coffee``. New/updated
    counts and any error entries are logged at the end.

    Returns nothing.

    NOTE(review): SOURCE contains a second ``scrape_heart`` further down;
    if both live in the same module the later definition replaces this
    one - confirm which is intended to be live.
    """
    roaster = 'Heart'
    heart_beans = 'http://www.heartroasters.com/collections/beans'
    heart_url = 'http://www.heartroasters.com'

    listing_page = requests.get(heart_beans)
    listing_soup = BeautifulSoup(listing_page.content, "html.parser")
    anchors = listing_soup.find_all('a', {'class': 'grid__image'})
    product_links = [
        "{}{}".format(heart_url, anchor['href'])
        for anchor in anchors
        if 'Subscription' not in anchor.find('img')['alt']
    ]

    total_coffees = len(product_links)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []

    for url in product_links:
        logging.info("Getting url: {}".format(url))
        product_page = requests.get(url)
        coffee_soup = BeautifulSoup(product_page.content, "html.parser")

        name = coffee_soup.h1.text.strip()
        is_blend = 'blend' in name.lower()

        # The first <option> holds "<size> - <price>" (or a Sold Out marker).
        size_price = coffee_soup.find('option').text
        size = size_price.split(" - ")[0]
        active = 'Sold Out' not in size_price
        if active:
            price_text = size_price.split(" - ")[1]
            price = float(price_text.replace('USD', '').replace('$', ''))
        else:
            price = 0

        tab_content = coffee_soup.find('div', {'class': 'tab-content small'})
        first_tab = tab_content.find('div', {'id': 'tab1'})
        # The description is stored UTF-8 encoded.
        description = first_tab.text.encode('utf-8').strip()

        flavors = coffee_soup.find('p', {'class': 'small uppercase flavors'})
        notes = flavors.text.split(',')

        # Blends get no region; everything else derives it from the name.
        region = ""
        if not is_blend:
            region = country_from_name(name)

        slide_image = coffee_soup.select('div.slide')[0].find('img')
        image_url = "http:{}".format(slide_image['src'])
        image_content = requests.get(image_url).content

        coffee_data = {
            'name': name,
            'roaster': roaster,
            'description': description,
            'price': price,
            'notes': notes,
            'region': region,
            'active': active,
            'product_page': url,
            'size': size,
            'image': image_content,
        }
        coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(
            coffee_data, coffees_updated, coffees_entered, error_coffees)

    logging.info('Heart New Results:{} / {}'.format(
        coffees_entered, total_coffees))
    logging.info('Heart Updated Results:{} / {}'.format(
        coffees_updated, total_coffees))
    if error_coffees:
        logging.warning('Heart Error coffees are: {}'.format(error_coffees))
def scrape_stumptown():
    """Scrape Stumptown's coffee listing and record every coffee found.

    Downloads the listing page, follows every ``a.product-grid._link``
    element whose href does not contain ``trio``, parses the product page
    and passes the collected fields plus the raw image bytes to
    ``add_or_update_coffee``. New/updated counts and any error entries
    returned by ``add_or_update_coffee`` are logged at the end.

    Notes and size are best-effort: when they cannot be read the field is
    left as an empty string (a warning is logged for size). A missing name,
    price, description or image element raises and aborts the scrape.

    Returns nothing.

    NOTE(review): SOURCE also contains an earlier ``scrape_stumptown``; if
    both live in the same module this later definition is the one that
    stays bound to the name - confirm the earlier one can be removed.
    """
    roaster = 'Stumptown'
    stumptown = 'https://www.stumptowncoffee.com/coffee'
    base_url = 'https://www.stumptowncoffee.com'
    r = requests.get(stumptown)
    soup = BeautifulSoup(r.content, "html.parser")
    coffees_for_sale = soup.select('a.product-grid._link')

    total_coffees = len(coffees_for_sale)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []
    ignored = "trio"

    for items in coffees_for_sale:
        url = items['href']
        if ignored in url:
            # Skipped links are not counted in the totals.
            total_coffees -= 1
            continue

        # Fallback values for the fields that are allowed to be missing.
        notes = ""
        size = ""

        product_url = base_url + url
        logging.info("Getting url: {}".format(url))
        r = requests.get(product_url)
        coffee_soup = BeautifulSoup(r.content, "html.parser")

        name = coffee_soup.h1.string.strip()
        price = float(coffee_soup.select_one('span.js-pdp-price').text)
        description = coffee_soup.select_one(
            'div.product._description').p.text

        try:
            # Notes come from the first <h3>; '&' is treated like a comma.
            notes = coffee_soup.h3.string.replace('&', ',').lower().split(',')
        except AttributeError:
            # No notes found; keep the empty fallback.
            pass

        region = country_from_name(name)

        # An <h6> on the product page marks the coffee as sold out.
        active = coffee_soup.h6 is None

        try:
            # Size in ounces: first run of digits in the second <p> of the
            # specs block.
            specs = coffee_soup.select_one('div.product._specs')
            size = '{} oz'.format(
                re.findall(r'\d+', specs.find_all('p')[1].string)[0])
        except Exception as e:
            # Deliberately broad: size is optional and the page layout
            # varies, so any failure here only costs the size field.
            logging.warning(
                "Error while getting size for {} : {}".format(name, e))

        image_url = coffee_soup.select_one(
            'div.product._image span')['data-src']
        image_content = requests.get(image_url).content

        coffee_data = {
            'name': name,
            'roaster': roaster,
            'description': description,
            'price': price,
            'notes': notes,
            'region': region,
            'active': active,
            'product_page': product_url,
            'size': size,
            'image': image_content,
        }
        coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(
            coffee_data, coffees_updated, coffees_entered, error_coffees)

    logging.info('Stumptown New Results:{} / {}'.format(
        coffees_entered, total_coffees))
    logging.info('Stumptown Updated Results:{} / {}'.format(
        coffees_updated, total_coffees))
    if error_coffees:
        logging.warning(
            'Stumptown Error coffees are: {}'.format(error_coffees))
def scrape_heart():
    """Scrape Heart's bean collection (Shopify host) and record each coffee.

    Downloads the collection page from ``heartcoffee.myshopify.com``,
    collects the product link of every ``a.grid__image`` element whose
    image alt text does not contain ``Subscription``, then parses each
    product page and passes the fields plus the raw image bytes to
    ``add_or_update_coffee``. The listing and product requests send an
    explicit ``Host`` header; the image request does not. New/updated
    counts and any error entries are logged at the end.

    Returns nothing.

    NOTE(review): SOURCE also contains an earlier ``scrape_heart``; if both
    live in the same module this later definition is the one that stays
    bound to the name - confirm the earlier one can be removed.
    """
    roaster = 'Heart'
    heart_beans = 'https://heartcoffee.myshopify.com/collections/beans'
    heart_url = 'https://heartcoffee.myshopify.com'
    host = 'heartcoffee.myshopify.com'

    collection_page = requests.get(heart_beans, headers={"Host": host})
    collection_soup = BeautifulSoup(collection_page.content, "html.parser")

    product_links = []
    for anchor in collection_soup.find_all('a', {'class': 'grid__image'}):
        alt_text = anchor.find('img')['alt']
        if 'Subscription' in alt_text:
            continue
        product_links.append("{}{}".format(heart_url, anchor['href']))

    total_coffees = len(product_links)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []

    for url in product_links:
        logging.info("Getting url: {}".format(url))
        product_page = requests.get(url, headers={"Host": host})
        coffee_soup = BeautifulSoup(product_page.content, "html.parser")

        name = coffee_soup.h1.text.strip()
        blend = 'blend' in name.lower()

        # The first <option> holds "<size> - <price>" (or a Sold Out marker).
        size_price = coffee_soup.find('option').text
        size = size_price.split(" - ")[0]
        if 'Sold Out' in size_price:
            active, price = False, 0
        else:
            amount = size_price.split(" - ")[1]
            active, price = True, float(
                amount.replace('USD', '').replace('$', ''))

        tabs = coffee_soup.find('div', {'class': 'tab-content small'})
        # The description is stored UTF-8 encoded.
        description = tabs.find('div', {'id': 'tab1'}).text.encode(
            'utf-8').strip()

        flavor_tag = coffee_soup.find(
            'p', {'class': 'small uppercase flavors'})
        notes = flavor_tag.text.split(',')

        # Blends get no region; everything else derives it from the name.
        region = "" if blend else country_from_name(name)

        first_slide = coffee_soup.select('div.slide')[0]
        image_url = "http:{}".format(first_slide.find('img')['src'])
        image_content = requests.get(image_url).content

        coffee_data = {
            'name': name,
            'roaster': roaster,
            'description': description,
            'price': price,
            'notes': notes,
            'region': region,
            'active': active,
            'product_page': url,
            'size': size,
            'image': image_content,
        }
        coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(
            coffee_data, coffees_updated, coffees_entered, error_coffees)

    logging.info('Heart New Results:{} / {}'.format(
        coffees_entered, total_coffees))
    logging.info('Heart Updated Results:{} / {}'.format(
        coffees_updated, total_coffees))
    if error_coffees:
        logging.warning('Heart Error coffees are: {}'.format(error_coffees))
def scrape_bluebottle():
    """Scrape Blue Bottle's store listing and record every coffee found.

    Downloads the store listing page, walks every ``h2.ma0`` heading, skips
    headings whose text contains one of the ignored words, and for the rest
    downloads the product page, pulls out the product fields plus the raw
    image bytes, and passes them to ``add_or_update_coffee``. New/updated
    counts and any error entries returned by ``add_or_update_coffee`` are
    logged at the end.

    A product page without a ``div.grid-col-4`` size block is skipped with
    a warning; it still counts towards the total, so the summary lines show
    the shortfall. Any other missing element raises and aborts the scrape.

    Returns nothing.

    NOTE(review): SOURCE also contains an earlier ``scrape_bluebottle``; if
    both live in the same module this later definition is the one that
    stays bound to the name - confirm the earlier one can be removed.
    """
    roaster = 'Blue Bottle'
    bluebottle = 'https://bluebottlecoffee.com/store/coffee'
    r = requests.get(bluebottle)
    # Response status goes to the log instead of stdout.
    logging.debug('Blue Bottle listing response: {}'.format(r))
    soup = BeautifulSoup(r.content, "html.parser")
    coffees_for_sale = soup.select('h2.ma0')

    total_coffees = len(coffees_for_sale)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []
    ignored = ['Box', 'Kit', 'Subscriptions', 'at Home', 'Pack']

    for item in coffees_for_sale:
        name = item.string
        if any(word in name for word in ignored):
            # Headings matching an ignored word are not counted.
            total_coffees -= 1
            continue

        url = item.a['href']
        product_url = 'https://bluebottlecoffee.com' + url
        logging.info("Getting url: {}".format(url))
        r = requests.get(product_url)
        coffee_soup = BeautifulSoup(r.content, "html.parser")

        active = True
        price = float(
            coffee_soup.find('span', {'class': 'js-variant-price'}).string)

        # Prefer the spec-details paragraph; fall back to spec-overview when
        # the page has no spec-details element.
        try:
            description = coffee_soup.find(
                'p', {'class': 'spec-details'}).string
        except AttributeError:
            description = coffee_soup.find(
                'p', {'class': 'spec-overview'}).string

        notes = coffee_soup.select('div.mb30')[0].string.split(',')

        # Start from the region derived from the product name; the first
        # child of spec-details overrides it when it is short text that
        # country_from_name maps to a non-empty value.
        region = country_from_name(name)
        try:
            details = coffee_soup.find(
                'p', {'class': 'spec-details'}).contents[0].strip()
            if country_from_name(details) != '' and len(details) < 10:
                region = details
        except AttributeError:
            # No usable details paragraph: an espresso is allowed to have
            # no region at all, anything else keeps the name-based region.
            if 'Espresso' in name:
                region = ""

        size_block = coffee_soup.find('div', {'class': 'grid-col-4'})
        if size_block is None:
            # Previously a silent skip; make the dropped product visible.
            logging.warning(
                'Skipping {}: no size found on {}'.format(name, product_url))
            continue
        size = size_block.text

        image_url = coffee_soup.img['src']
        image_content = requests.get(image_url).content

        coffee_data = {
            'name': name,
            'roaster': roaster,
            'description': description,
            'price': price,
            'notes': notes,
            'region': region,
            'active': active,
            'product_page': product_url,
            'size': size,
            'image': image_content,
        }
        coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(
            coffee_data, coffees_updated, coffees_entered, error_coffees)

    logging.info('Blue Bottle New Results:{} / {}'.format(
        coffees_entered, total_coffees))
    logging.info('Blue Bottle Updated Results:{} / {}'.format(
        coffees_updated, total_coffees))
    if error_coffees:
        logging.warning(
            'Blue Bottle Error coffees are: {}'.format(error_coffees))