Example #1
def scrape_matchstick():
    roaster = "Matchstick"
    r = requests.get('http://www.matchstickcoffee.com/coffee/')
    soup = BeautifulSoup(r.content, "html.parser")
    coffees_for_sale = soup.find_all('div', {'class':'type-post'})
    total_coffees = len(coffees_for_sale)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []
    for item in coffees_for_sale:
        name, description, notes, region, active, size, product_url = [""] * 7
        price = float()
        url = item.a['href']
        product_url = url
        region = item.find(text='Origin:').next_element.strip()
        noteloc = item.find(text='Notes:').next_element
        notes = [x.strip() for x in noteloc.split(',')]
        price_and_size = noteloc.next_element.next_element.text.split(' / ')
        price = float(price_and_size[0][1:])
        size = price_and_size[1]
        active = True
        coffee_soup = BeautifulSoup(requests.get(url).content)
        name = coffee_soup.h1.string
        # not sure if the descriptions here matter at all
        description = coffee_soup.find(text='Notes:').next_element.next_element.next_element.next_element.next_element.next_element.next_element
        # the URL may contain unicode characters
        image_url = item.find('img')['src']
        image_content = requests.get(image_url).content
        coffee_data = {'name':name, 'roaster':roaster, 'description':description, 'price':price, 'notes':notes, 'region':region, 'active':active, 'product_page':product_url, 'size':size, 'image': image_content}
        coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(coffee_data, coffees_updated, coffees_entered, error_coffees)
    
    logging.info('Matchstick New Results:{} / {}'.format(coffees_entered, total_coffees))
    logging.info('Matchstick Updated Results:{} / {}'.format(coffees_updated, total_coffees))
    if error_coffees:
        logging.warning('Matchstick Error coffees are: {}'.format(error_coffees))
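
All of these scrapers share the same imports (requests, BeautifulSoup, logging, re, and App Engine's urlfetch in the Intelligentsia versions) and two project helpers, add_or_update_coffee and country_from_name, whose implementations are not included in the examples. Below is a minimal sketch of what they might look like, inferred only from the call sites; the country list and the persistence step are assumptions, not the real code.

import logging
import re

import requests
from bs4 import BeautifulSoup

# Hypothetical helper sketches; the real implementations are not shown in these examples.
KNOWN_COUNTRIES = ['Ethiopia', 'Kenya', 'Colombia', 'Guatemala', 'Honduras',
                   'Brazil', 'Peru', 'Rwanda', 'Burundi', 'El Salvador']


def country_from_name(name):
    """Guess the origin country from a coffee name; return '' when nothing matches."""
    for country in KNOWN_COUNTRIES:
        if country.lower() in name.lower():
            return country
    return ''


def add_or_update_coffee(coffee_data, coffees_updated, coffees_entered, error_coffees):
    """Placeholder: record one coffee and hand the counters back to the caller.

    The real version presumably writes coffee_data to a datastore, decides
    whether it is a new record or an update to an existing one, and appends
    failures to error_coffees; here it only counts the record as newly entered.
    """
    coffees_entered += 1
    return coffees_updated, coffees_entered, error_coffees
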
Example #2
def scrape_stumptown():
    roaster = 'Stumptown'
    stumptown = 'https://www.stumptowncoffee.com/coffee'
    r = requests.get(stumptown)
    soup = BeautifulSoup(r.content, "html.parser")
    # class="product-grid _link"
    coffees_for_sale = soup.find_all('a', {'class':'product-grid _link'})
    # keeping track of how many coffees
    total_coffees = len(coffees_for_sale)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []
    for items in coffees_for_sale:
        url = items['href']
        if 'trio' not in url:
            name, price, description, notes, region, active, size = [""] * 7
            product_url = 'https://www.stumptowncoffee.com'+url
            logging.info("Getting url: {}".format(url))
            r = requests.get(product_url)
            coffee_soup = BeautifulSoup(r.content, "html.parser")
            # product name h1 class="product _title -desktop theme-color js-pdp-title"
            name = coffee_soup.h1.string.strip()
            try:
                price = float(coffee_soup.find_all('span',{'class':'js-pdp-price'})[0].string)
            except IndexError as e:
                logging.warn("Error while getting price for {} : {}".format(name, e))
            # div class="product _description
            description = coffee_soup.find('div', {'class':'product _description'}).p.string
            try:
                notes = coffee_soup.h3.string.replace('&',',').lower().split(',')
            except AttributeError:
                # no notes found
                pass
            region = country_from_name(name)
            if coffee_soup.h6:
                # it's sold out
                active = False
            else:
                active = True
            # size in ounces
            try:
                size = '{} oz'.format(re.findall(r'\d+', coffee_soup.find('div', {'class':'product _specs'}).find_all('p')[1].string)[0])
            except Exception as e:
                logging.warn("Error while getting size for {} : {}".format(name, e))
            image_url = coffee_soup.select('div.product._image')[0].find('span')['data-src']
            image_content = requests.get(image_url).content
            coffee_data = {'name': name, 'roaster': roaster, 'description': description, 'price': price, 'notes': notes, 'region': region, 'active': active, 'product_page': product_url, 'size': size, 'image': image_content}
            coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(coffee_data, coffees_updated, coffees_entered, error_coffees)
        else:
            total_coffees -= 1

    logging.info('Stumptown New Results:{} / {}'.format(coffees_entered, total_coffees))
    logging.info('Stumptown Updated Results:{} / {}'.format(coffees_updated, total_coffees))
    if error_coffees:
        logging.warning('Stumptown Error coffees are: {}'.format(error_coffees))
Example #3
def scrape_intelli():
    urlfetch.set_default_fetch_deadline(10)
    roaster = 'Intelligentsia'
    intelli = 'https://www.intelligentsiacoffee.com/catalog/ajax/products/?filter%5Bcat%5D=5'
    r = requests.get(intelli)
    soup = BeautifulSoup(r.content, "html.parser")
    x = r.json()

    total_coffees = len(x['data'])
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []
    for item in x['data']:
        name, description, notes, region, active, size, product_url = [""] * 7
        price = int()
        product_url = item['productUrl']
        logging.info("Getting url: {}".format(product_url))
        try:
            notes = item['flavor_profile_text'].split(',')
        except KeyError:
            notes = [""]
        name = item['original_name']
        description = item['description']
        region = item['country']
        price = float(item['price'])
        size = '12oz'
        active = True
        image_url = 'https://www.intelligentsiacoffee.com/media/catalog/product' + item[
            'small_image']
        image_blob = requests.get(image_url).content
        coffee_data = {
            'name': name,
            'roaster': roaster,
            'description': description,
            'price': price,
            'notes': notes,
            'region': region,
            'active': active,
            'product_page': product_url,
            'size': size,
            'image': image_blob
        }
        coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(
            coffee_data, coffees_updated, coffees_entered, error_coffees)

    logging.info('Intelligentsia New Results:{} / {}'.format(
        coffees_entered, total_coffees))
    logging.info('Intelligentsia Updated Results:{} / {}'.format(
        coffees_updated, total_coffees))
    if error_coffees:
        logging.warning(
            'Intelligentsia Error coffees are: {}'.format(error_coffees))
Example #4
def scrape_bluebottle():
    roaster = 'Blue Bottle'
    bluebottle = 'https://bluebottlecoffee.com/store/coffee'
    r = requests.get(bluebottle)
    soup = BeautifulSoup(r.content, "html.parser")
    coffees_for_sale = soup.find_all('h2', {'class':'f5 lh-title man'})
    total_coffees = len(coffees_for_sale)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []
    ignored = ['Box', 'Kit', 'Subscriptions', 'at Home']
    for item in coffees_for_sale:
        name, description, notes, region, active, size, product_url = [""] * 7
        price = float()
        name = item.string
        if any(word in name for word in ignored):
            total_coffees -= 1
        else:
            url = item.a['href']
            product_url = 'https://bluebottlecoffee.com' + url
            logging.info("Getting url: {}".format(url))
            r = requests.get(product_url)
            coffee_soup = BeautifulSoup(r.content, "html.parser")
            active = True
            price = float(coffee_soup.find('span', {'class':'js-variant-price'}).string)
            description = coffee_soup.find('p', {'class':'spec-overview'}).string
            notes = coffee_soup.p.string.lower().split(',')
            # only works if the country appears in the name
            region = country_from_name(name)
            try:
                details = coffee_soup.find('p', {'class':'spec-details'}).contents[0].strip()
                if country_from_name(details) != '':
                    region = details
            except AttributeError:
                # if it's an espresso, then it's okay to not have region
                if 'Espresso' in name:
                    region = ""
            size = coffee_soup.find('select', {'id':'cart_item_model_id'}).option.string.split('Bag')[0]
            image_url = coffee_soup.img['src']
            image_content = requests.get(image_url).content
            coffee_data = {'name': name, 'roaster': roaster, 'description': description, 'price': price, 'notes': notes, 'region': region, 'active': active, 'product_page': product_url, 'size': size, 'image': image_content}
            coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(coffee_data, coffees_updated, coffees_entered, error_coffees)

    logging.info('Blue Bottle New Results:{} / {}'.format(coffees_entered, total_coffees))
    logging.info('Blue Bottle Updated Results:{} / {}'.format(coffees_updated, total_coffees))
    if error_coffees:
        logging.warning('Blue Bottle Error coffees are: {}'.format(error_coffees))
Example #5
def scrape_heart():
    roaster = 'Heart'
    heart_beans = 'http://www.heartroasters.com/collections/beans'
    heart_url = 'http://www.heartroasters.com'

    r = requests.get(heart_beans)
    soup = BeautifulSoup(r.content, "html.parser")
    all_coffees_for_sale = soup.find_all('a', {'class':'grid__image'})
    all_coffee_links = []
    for coffee in all_coffees_for_sale:
        if 'Subscription' not in coffee.find('img')['alt']:
            all_coffee_links.append("{}{}".format(heart_url, coffee['href']))
    total_coffees = len(all_coffee_links)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []
    for url in all_coffee_links:
        name, price, description, notes, region, active, size = [""] * 7
        logging.info("Getting url: {}".format(url))
        r = requests.get(url)
        coffee_soup = BeautifulSoup(r.content, "html.parser")
        blend = False
        active = True
        name = coffee_soup.h1.text.strip()
        if 'blend' in name.lower():
            blend = True
        size_price = coffee_soup.find('option').text
        size = size_price.split(" - ")[0]
        if 'Sold Out' in size_price:
            active = False
            price = 0
        else:
            price = float(size_price.split(" - ")[1].replace('USD', '').replace('$', ''))
        description = coffee_soup.find('div', {'class':'tab-content small'}).find('div',{'id': 'tab1'}).text.encode('utf-8').strip()
        notes = coffee_soup.find('p',{'class': 'small uppercase flavors'}).text.split(',')
        if not blend:
            region = country_from_name(name)
            # region = coffee_soup.find('div', {'class':'tab-content small'}).find('div',{'id': 'tab1'}).p.text.replace(u'Location:\xa0', '').replace('Location:', '').encode('utf-8')
        image_url = "http:{}".format(coffee_soup.select('div.slide')[0].find('img')['src'])
        image_content = requests.get(image_url).content
        coffee_data = {'name': name, 'roaster': roaster, 'description': description, 'price': price, 'notes': notes, 'region': region, 'active': active, 'product_page': url, 'size': size, 'image': image_content}
        coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(coffee_data, coffees_updated, coffees_entered, error_coffees)

    logging.info('Heart New Results:{} / {}'.format(coffees_entered, total_coffees))
    logging.info('Heart Updated Results:{} / {}'.format(coffees_updated, total_coffees))
    if error_coffees:
        logging.warning('Heart Error coffees are: {}'.format(error_coffees))
Example #6
def scrape_fortyninth():
    roaster = '49th Parallel'
    base_url = 'http://49thcoffee.com/collections/coffee'
    r = requests.get(base_url)
    soup = BeautifulSoup(r.content, "html.parser")
    coffees_for_sale = soup.find_all('li', {'class':'product-listing'})
    total_coffees = len(coffees_for_sale)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []
    ignored = ['Subscription']
    for item in coffees_for_sale:
        name, description, notes, region, active, size, product_url = [""] * 7
        price = float()
        name = item.h1.string
        if any(word in name for word in ignored):
            total_coffees -= 1
        else:
            url = item.a['href']
            product_url = 'http://49thcoffee.com' + url
            logging.info("Getting url: {}".format(product_url))
            r = requests.get(product_url)
            coffee_soup = BeautifulSoup(r.content, "html.parser")
            # logging.info("Title: {}".format(coffee_soup.title))
            details = coffee_soup.find('div', itemprop='description')
            d = details.p
            for sentence in d:
                if sentence.string:
                    description += sentence.string
            notes = details.h3.string.lower()
            notes = notes.split(' // ')
            region = coffee_soup.find('li', {'class':'product-detail-country'}).string.split()[1]
            size = item.find('data', {'class':'product-size'}).string.strip()
            price = float(item.find('data', {'class':'product-price'}).string[1:])
            active = True
            image_url = 'https:' + coffee_soup.find('meta', itemprop='image')['content']
            image_content = requests.get(image_url).content
            coffee_data = {'name': name, 'roaster': roaster, 'description': description, 'price': price, 'notes': notes, 'region': region, 'active': active, 'product_page': product_url, 'size': size, 'image': image_content}
            coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(coffee_data, coffees_updated, coffees_entered, error_coffees)

    logging.info('49th Parallel New Results:{} / {}'.format(coffees_entered, total_coffees))
    logging.info('49th Parallel Updated Results:{} / {}'.format(coffees_updated, total_coffees))
    if error_coffees:
        logging.warning('49th Parallel Error coffees are: {}'.format(error_coffees))
Example #7
def scrape_intelli():
    urlfetch.set_default_fetch_deadline(10)
    roaster = 'Intelligentsia'
    intelli = 'http://www.intelligentsiacoffee.com/products/coffee'
    r = requests.get(intelli)
    soup = BeautifulSoup(r.content, "html.parser")

    # each coffee is under class="grid_4 node node-type-product-coffee node-teaser build-mode-teaser"
    coffees_for_sale = soup.find_all('div', {'class': 'node-type-product-coffee'})
    # the listing contains duplicates, so dedupe by product link
    # (Tag objects are not reliably hashable, so compare the href strings)
    seen = set()
    uniq_coffees_for_sale = []
    for x in coffees_for_sale:
        href = x.a['href']
        if href not in seen:
            uniq_coffees_for_sale.append(x)
            seen.add(href)
    total_coffees = len(uniq_coffees_for_sale)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []
    for item in uniq_coffees_for_sale:
        name, description, notes, region, active, size, product_url = [""] * 7
        price = int()
        product_url = 'http://www.intelligentsiacoffee.com' + item.a['href']
        logging.info("Getting url: {}".format(product_url))
        notes_list = item.p.contents
        notes = [notes_list[2].strip().lower(),notes_list[4].strip().lower(),notes_list[6].strip().lower()]
        name = item.find('div', {'class': 'productListingDescBox'}).strong.string
        r = requests.get(product_url)
        coffee_soup = BeautifulSoup(r.content, "html.parser")
        try:
            price = float(coffee_soup.find('p', {'class': 'coffeeDetailPrice'}).em.string[1:])
            # size gives value + unit
            size = coffee_soup.find('p', {'class': 'coffeeDetailPrice'}).em.next_sibling.strip()[2:]
            active = True
        except AttributeError:
            logging.info("no price or size for: {}".format(product_url))
            # if 'OUT' in coffee_soup.find('p', {'class': 'coffeeDetailPrice'}).string:
            # it's sold out
            active = False
        blend_or_origin = coffee_soup.find_all('p', {'class': 'coffeeDetailExtraInfoHeader'})
        blend_or_origin = [x.string for x in blend_or_origin]
        # region + country
        try:
            region = coffee_soup.find(text='Country').next_element.string
        except AttributeError:
            # check if it's a blend
            if 'Blend' in blend_or_origin:
                region = 'Blend'
        image_url = coffee_soup.find('div', {'class': 'productPhotoSlide'}).find('img')['src']
        image_blob = requests.get(image_url).content
        description = coffee_soup.find('div', {'class': 'product-body'}).string
        coffee_data = {'name':name, 'roaster':roaster, 'description':description, 'price':price, 'notes':notes, 'region':region, 'active':active, 'product_page':product_url, 'size':size, 'image': image_blob}
        coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(coffee_data, coffees_updated, coffees_entered, error_coffees)

    logging.info('Intelligentsia New Results:{} / {}'.format(coffees_entered, total_coffees))
    logging.info('Intelligentsia Updated Results:{} / {}'.format(coffees_updated, total_coffees))
    if error_coffees:
        logging.warning('Intelligentsia Error coffees are: {}'.format(error_coffees))
Example #8
def scrape_stumptown():
    roaster = 'Stumptown'
    stumptown = 'https://www.stumptowncoffee.com/coffee'
    base_url = 'https://www.stumptowncoffee.com'

    r = requests.get(stumptown)
    soup = BeautifulSoup(r.content, "html.parser")
    coffees_for_sale = soup.select('a.product-grid._link')
    total_coffees = len(coffees_for_sale)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []
    ignored = "trio"
    for items in coffees_for_sale:
        url = items['href']
        if ignored in url:
            total_coffees = total_coffees - 1
            continue

        name, price, description, notes, region, active, size = [""] * 7

        product_url = base_url + url
        logging.info("Getting url: {}".format(url))
        r = requests.get(product_url)
        coffee_soup = BeautifulSoup(r.content, "html.parser")

        name = coffee_soup.h1.string.strip()
        price = float(coffee_soup.select_one('span.js-pdp-price').text)
        description = coffee_soup.select_one('div.product._description').p.text
        try:
            notes = coffee_soup.h3.string.replace('&', ',').lower().split(',')
        except AttributeError:  # no notes found
            pass
        region = country_from_name(name)
        active = True
        if coffee_soup.h6:  # it's sold out
            active = False
        try:
            size = '{} oz'.format(
                re.findall(
                    r'\d+',
                    coffee_soup.select_one('div.product._specs').find_all('p')
                    [1].string)[0])
        except Exception as e:
            logging.warn("Error while getting size for {} : {}".format(
                name, e))
        image_url = coffee_soup.select_one(
            'div.product._image span')['data-src']
        image_content = requests.get(image_url).content
        coffee_data = {
            'name': name,
            'roaster': roaster,
            'description': description,
            'price': price,
            'notes': notes,
            'region': region,
            'active': active,
            'product_page': product_url,
            'size': size,
            'image': image_content
        }
        coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(
            coffee_data, coffees_updated, coffees_entered, error_coffees)

    logging.info('Stumptown New Results:{} / {}'.format(
        coffees_entered, total_coffees))
    logging.info('Stumptown Updated Results:{} / {}'.format(
        coffees_updated, total_coffees))
    if error_coffees:
        logging.warning(
            'Stumptown Error coffees are: {}'.format(error_coffees))
Example #9
def scrape_victrola():
    roaster = 'Victrola'
    victrola = 'http://www.victrolacoffee.com/collections/all-coffee-offerings'
    r = requests.get(victrola)
    soup = BeautifulSoup(r.content, "html.parser")
    coffees_for_sale = soup.find_all('a', {'class':'product-link'})
    total_coffees = len(coffees_for_sale)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []
    for item in coffees_for_sale:
        name, description, notes, region, active, size, product_url = [""] * 7
        price = int()
        url = item['href']
        product_url = 'http://www.victrolacoffee.com' + url
        logging.info("Getting url: {}".format(url))
        r = requests.get(product_url)
        coffee_soup = BeautifulSoup(r.content, "html.parser")
        name = coffee_soup.h2.string
        if 'Subscription' in name:
            total_coffees -= 1
            continue
        if coffee_soup.find('div', {'class': 'select single'}).find('label').text != 'Size':
            total_coffees -= 1
            continue
        try:
            size = coffee_soup.find('select').option.string[:4]
        except AttributeError:
            logging.info('Cannot find size for {}'.format(name))
            continue
        try:
            price = float(coffee_soup.find(itemprop='price').string.strip()[2:])
            active = True
        except AttributeError:
            # it's sold out
            active = False
        d = coffee_soup.find('h4', {'class':'mobile'}).next_siblings
        if 'Blend' in name:
            # different stuff for blends
            notes = []
            region = ''
            for x in d:
                if x.string:
                    description += x.string.strip()
        else:
            # sometimes the tasting notes stand alone
            # sometimes they are under 'Flavor'
            # and sometimes there are no tasting notes at all
            flavor = coffee_soup(text=re.compile('Flavor:'))
            if flavor:
                notes = flavor[1].string.strip()[8:].rstrip(',').lower().split(',')
            else:
                try:
                    notes = coffee_soup.find(text="Tasting Notes").next_element.strip()[2:].rstrip(',').lower().split(',')
                except AttributeError:
                    # can't find any tasting notes
                    notes = []
                    logging.info('No tasting notes for {}'.format(product_url))
        image_url = coffee_soup.find('ul', {'class': 'bx-slider'}).find('img')['src']
        image_content = requests.get("http:{}".format(image_url)).content
        coffee_data = {'name':name, 'roaster':roaster, 'description':description, 'price':price, 'notes':notes, 'region':region, 'active':active, 'product_page':product_url, 'size':size, 'image': image_content}
        coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(coffee_data, coffees_updated, coffees_entered, error_coffees)

    logging.info('Victrola New Results:{} / {}'.format(coffees_entered, total_coffees))
    logging.info('Victrola Updated Results:{} / {}'.format(coffees_updated, total_coffees))
    if error_coffees:
        logging.warning('Victrola Error coffees are: {}'.format(error_coffees))
Example #10
def scrape_fortyninth():
    roaster = '49th Parallel'
    coffee_url = 'http://49th-parallel.myshopify.com/collections/coffee'
    base_url = 'https://49th-parallel.myshopify.com'
    
    r = requests.get(coffee_url)
    soup = BeautifulSoup(r.content, "html.parser")
    coffees_for_sale = soup.find_all('li', {'class': 'product-listing'})
    total_coffees = len(coffees_for_sale)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []
    ignored = ['Subscription']
    for item in coffees_for_sale:
        name, description, notes, region, active, size, product_url = [""] * 7
        price = float()
        name = item.h1.string
        if any(word in name for word in ignored):
            total_coffees -= 1
        else:
            url = item.a['href']
            product_url = base_url + url
            logging.info("Getting url: {}".format(product_url))
            r = requests.get(product_url)
            coffee_soup = BeautifulSoup(r.content, "html.parser")
            # logging.info("Title: {}".format(coffee_soup.title))
            details = coffee_soup.find('div', itemprop='description')
            d = coffee_soup.find_all('p', {'class': 'p1'})
            if not d:
                try:
                    description = details.p.string
                except AttributeError:
                    description = details.span.string
            else:
                description = d[0].string
            notes = details.h3.string.lower()
            notes = notes.split(' // ')
            region = coffee_soup.find(
                'li', {'class': 'product-detail-country'}).string.split()[1]
            size = item.find('data', {'class': 'product-size'}).string.strip()
            price = float(
                item.find('data', {'class': 'product-price'}).string[1:])
            active = True
            image_url = 'https:' + coffee_soup.find(
                'meta', itemprop='image')['content']
            image_content = requests.get(image_url).content
            coffee_data = {
                'name': name,
                'roaster': roaster,
                'description': description,
                'price': price,
                'notes': notes,
                'region': region,
                'active': active,
                'product_page': product_url,
                'size': size,
                'image': image_content
            }
            coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(
                coffee_data, coffees_updated, coffees_entered, error_coffees)

    logging.info('49th Parallel New Results:{} / {}'.format(coffees_entered,
                                                            total_coffees))
    logging.info('49th Parallel Updated Results:{} / {}'.format(coffees_updated,
                                                                total_coffees))
    if error_coffees:
        logging.warning('49th Parallel Error coffees are: {}'.format(
            error_coffees))
Example #11
def scrape_victrola():
    roaster = 'Victrola'
    base_url = 'https://victrola.myshopify.com'
    victrola = 'https://victrola.myshopify.com/collections/all-coffee-offerings'

    r = requests.get(victrola)
    soup = BeautifulSoup(r.content, "html.parser")
    coffees_for_sale = soup.select('a.product-link')
    total_coffees = len(coffees_for_sale)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []
    ignored = 'subscription'
    for item in coffees_for_sale:
        url = item['href']
        if ignored in url:
            total_coffees = total_coffees - 1
            continue
        name, description, notes, region, active, size, product_url = [""] * 7
        price = int()
        product_url = base_url + url
        logging.info("Getting url: {}".format(url))
        coffee_soup = BeautifulSoup(
            requests.get(product_url).content, "html.parser")
        if coffee_soup.find(
                'div',
            {'class': 'select single'}).find('label').text != 'Size':
            # it's sold out?
            total_coffees = total_coffees - 1
            continue
        name = coffee_soup.h2.string
        try:
            size = coffee_soup.find('select').option.string.replace(" ",
                                                                    "")[:4]
        except AttributeError:
            logging.info('Cannot find size for {}'.format(name))
            continue
        active = False
        if coffee_soup.find(itemprop='price'):
            price = float(
                coffee_soup.find(itemprop='price').string.strip()[2:])
            active = True
        description_raw = coffee_soup.select_one('h4.mobile').next_siblings
        if 'Blend' in name:
            # different stuff for blends
            notes = []
            region = ''
            for x in description_raw:
                if x.string:
                    description += x.string.strip()
        else:
            # sometimes the tasting notes stand alone
            # sometimes they are under 'Flavor'
            # and sometimes there are no tasting notes at all
            flavor = coffee_soup(text=re.compile('Flavor:'))
            tasting_notes = coffee_soup.find(text="Tasting Notes")
            if flavor:
                notes = flavor[1].string.strip()[8:].rstrip(',').lower().split(
                    ',')
            elif tasting_notes:
                notes = coffee_soup.find(
                    text="Tasting Notes").next_element.strip()[2:].rstrip(
                        ',').lower().split(',')
            else:
                # can't find any tasting notes
                notes = []
                logging.info('No tasting notes for {}'.format(product_url))
        # slider image is too big so we're using the twitter one
        # image_url = coffee_soup.select_one('ul.bx-slider').select_one('img')['src']
        image_url = coffee_soup.find("meta", {"name": "twitter:image"})["content"]
        image_content = requests.get(image_url).content
        coffee_data = {
            'name': name,
            'roaster': roaster,
            'description': description,
            'price': price,
            'notes': notes,
            'region': region,
            'active': active,
            'product_page': product_url,
            'size': size,
            'image': image_content
        }
        coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(
            coffee_data, coffees_updated, coffees_entered, error_coffees)

    logging.info('Victrola New Results:{} / {}'.format(coffees_entered,
                                                       total_coffees))
    logging.info('Victrola Updated Results:{} / {}'.format(coffees_updated,
                                                           total_coffees))
    if error_coffees:
        logging.warning('Victrola Error coffees are: {}'.format(error_coffees))
Example #12
def scrape_heart():
    roaster = 'Heart'
    heart_beans = 'https://heartcoffee.myshopify.com/collections/beans'
    heart_url = 'https://heartcoffee.myshopify.com'
    host = 'heartcoffee.myshopify.com'

    r = requests.get(heart_beans, headers={"Host": host})
    soup = BeautifulSoup(r.content, "html.parser")
    all_coffees_for_sale = soup.find_all('a', {'class': 'grid__image'})
    all_coffee_links = []
    for coffee in all_coffees_for_sale:
        if 'Subscription' not in coffee.find('img')['alt']:
            all_coffee_links.append("{}{}".format(heart_url, coffee['href']))
    total_coffees = len(all_coffee_links)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []
    for url in all_coffee_links:
        name, price, description, notes, region, active, size = [""] * 7
        logging.info("Getting url: {}".format(url))
        r = requests.get(url, headers={"Host": host})
        coffee_soup = BeautifulSoup(r.content, "html.parser")
        blend = False
        active = True
        name = coffee_soup.h1.text.strip()
        if 'blend' in name.lower():
            blend = True
        size_price = coffee_soup.find('option').text
        size = size_price.split(" - ")[0]
        if 'Sold Out' in size_price:
            active = False
            price = 0
        else:
            price = float(
                size_price.split(" - ")[1].replace('USD', '').replace('$', ''))
        description = coffee_soup.find('div', {
            'class': 'tab-content small'
        }).find('div', {
            'id': 'tab1'
        }).text.encode('utf-8').strip()
        notes = coffee_soup.find('p', {
            'class': 'small uppercase flavors'
        }).text.split(',')
        if not blend:
            region = country_from_name(name)
            # region = coffee_soup.find('div', {'class':'tab-content small'}).find('div',{'id': 'tab1'}).p.text.replace(u'Location:\xa0', '').replace('Location:', '').encode('utf-8')
        image_url = "http:{}".format(
            coffee_soup.select('div.slide')[0].find('img')['src'])
        image_content = requests.get(image_url).content
        coffee_data = {
            'name': name,
            'roaster': roaster,
            'description': description,
            'price': price,
            'notes': notes,
            'region': region,
            'active': active,
            'product_page': url,
            'size': size,
            'image': image_content
        }
        coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(
            coffee_data, coffees_updated, coffees_entered, error_coffees)

    logging.info('Heart New Results:{} / {}'.format(coffees_entered,
                                                    total_coffees))
    logging.info('Heart Updated Results:{} / {}'.format(
        coffees_updated, total_coffees))
    if error_coffees:
        logging.warning('Heart Error coffees are: {}'.format(error_coffees))
Example #13
def scrape_matchstick():
    roaster = "Matchstick"
    base_url = "https://matchstickcoffee80.myshopify.com"
    r = requests.get(
        "https://matchstickcoffee80.myshopify.com/collections/coffee/")
    soup = BeautifulSoup(r.content, "html.parser")

    coffees_for_sale = soup.select('div.productItem')
    total_coffees = len(coffees_for_sale)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []
    ignored = 'subscription'

    for item in coffees_for_sale:
        if ignored in item.text.lower():
            total_coffees = total_coffees - 1
            continue

        name, description, notes, region, active, size, product_url = [""] * 7
        price = float()
        product_url = item.a['href']
        coffee_soup = BeautifulSoup(
            requests.get(base_url + product_url).content, "html.parser")
        name = coffee_soup.h1.text
        location_string = coffee_soup.find(text=re.compile('Location:'))
        region_string = coffee_soup.find(text=re.compile('Region:'))
        if hasattr(location_string.next_element, 'text'):
            location_str = location_string.next_element.text.strip()
        else:
            location_str = location_string.next_element.strip()
        if hasattr(region_string.next_element, 'text'):
            region_str = region_string.next_element.text.strip()
        else:
            region_str = region_string.next_element.strip()
        region = u"{} - {}".format(location_str, region_str)
        if coffee_soup.find(text=re.compile('Tasting Notes')):
            notes_string = coffee_soup.find(
                text=re.compile('Tasting Notes')).next_element
            notes = [note.strip() for note in notes_string.text.split(',')]
        else:
            notes = []
        price = float(
            coffee_soup.select_one('span#ProductPrice').text.strip().strip(
                '$'))
        size_container = coffee_soup.select_one('div.swatchBox')
        size = size_container.select_one('input[checked]')['value']
        active = True
        product_info = coffee_soup.select_one(
            'div.product-info') or coffee_soup.select_one('span.s1')
        if product_info.find('strong'):
            product_info.find('strong').decompose()
        description = product_info.text.strip()
        image_container = coffee_soup.select_one('div#ProductPhoto')
        image_url = 'http:' + image_container.find('img')['src']
        image_content = requests.get(image_url).content
        coffee_data = {
            'name': name,
            'roaster': roaster,
            'description': description,
            'price': price,
            'notes': notes,
            'region': region,
            'active': active,
            'product_page': base_url + product_url,
            'size': size,
            'image': image_content
        }
        coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(
            coffee_data, coffees_updated, coffees_entered, error_coffees)

    logging.info('Matchstick New Results:{} / {}'.format(
        coffees_entered, total_coffees))
    logging.info('Matchstick Updated Results:{} / {}'.format(
        coffees_updated, total_coffees))
    if error_coffees:
        logging.warning(
            'Matchstick Error coffees are: {}'.format(error_coffees))
Example #14
def scrape_bluebottle():
    roaster = 'Blue Bottle'
    bluebottle = 'https://bluebottlecoffee.com/store/coffee'
    r = requests.get(bluebottle)
    soup = BeautifulSoup(r.content, "html.parser")
    coffees_for_sale = soup.select('h2.ma0')
    total_coffees = len(coffees_for_sale)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []
    ignored = ['Box', 'Kit', 'Subscriptions', 'at Home', 'Pack']
    for item in coffees_for_sale:
        name, description, notes, region, active, size, product_url = [""] * 7
        price = float()
        name = item.string
        if any(word in name for word in ignored):
            total_coffees -= 1
        else:
            url = item.a['href']
            product_url = 'https://bluebottlecoffee.com' + url
            logging.info("Getting url: {}".format(url))
            r = requests.get(product_url)
            coffee_soup = BeautifulSoup(r.content, "html.parser")
            active = True
            price = float(
                coffee_soup.find('span', {
                    'class': 'js-variant-price'
                }).string)
            try:
                description = coffee_soup.find('p', {
                    'class': 'spec-details'
                }).string
            except AttributeError:
                description = coffee_soup.find('p', {
                    'class': 'spec-overview'
                }).string
            notes = coffee_soup.select('div.mb30')[0].string.split(',')
            # only works if the country appears in the name
            region = country_from_name(name)
            try:
                details = coffee_soup.find('p', {
                    'class': 'spec-details'
                }).contents[0].strip()
                if country_from_name(details) != '' and len(details) < 10:
                    region = details
            except AttributeError:
                # if it's an espresso, then it's okay to not have region
                if 'Espresso' in name:
                    region = ""
            try:
                size = coffee_soup.find('div', {'class': 'grid-col-4'}).text
            except AttributeError:
                continue
            image_url = coffee_soup.img['src']
            image_content = requests.get(image_url).content
            coffee_data = {
                'name': name,
                'roaster': roaster,
                'description': description,
                'price': price,
                'notes': notes,
                'region': region,
                'active': active,
                'product_page': product_url,
                'size': size,
                'image': image_content
            }
            coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(
                coffee_data, coffees_updated, coffees_entered, error_coffees)

    logging.info('Blue Bottle New Results:{} / {}'.format(
        coffees_entered, total_coffees))
    logging.info('Blue Bottle Updated Results:{} / {}'.format(
        coffees_updated, total_coffees))
    if error_coffees:
        logging.warning(
            'Blue Bottle Error coffees are: {}'.format(error_coffees))
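
All of these functions are presumably invoked from a scheduled entry point (the urlfetch deadline call in the Intelligentsia versions suggests a Google App Engine cron job), but that code is not part of the examples. A minimal sketch of a runner, assuming only the function names defined above; run_all_scrapers itself is hypothetical:

def run_all_scrapers():
    scrapers = [
        scrape_matchstick,
        scrape_stumptown,
        scrape_intelli,
        scrape_bluebottle,
        scrape_heart,
        scrape_fortyninth,
        scrape_victrola,
    ]
    for scraper in scrapers:
        try:
            scraper()
        except Exception:
            # one broken roaster page should not stop the rest of the run
            logging.exception('Scraper {} failed'.format(scraper.__name__))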