def scrape_bluebottle():
    """Scrape the Blue Bottle store listing and store every coffee found.

    Each ``h2`` heading on the listing page is treated as one product.
    Products whose name contains one of the ignored words are dropped from
    the total; every other product page is fetched, parsed, and handed to
    ``add_or_update_coffee``.  A summary of new/updated counts is logged at
    the end.
    """
    roaster = 'Blue Bottle'
    listing_page = requests.get('https://bluebottlecoffee.com/store/coffee')
    listing_soup = BeautifulSoup(listing_page.content, "html.parser")
    headings = listing_soup.find_all('h2', {'class': 'f5 lh-title man'})

    total_coffees = len(headings)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []
    skip_words = ['Box', 'Kit', 'Subscriptions', 'at Home']

    for heading in headings:
        name = heading.string
        if any(word in name for word in skip_words):
            # Not a bag of coffee: leave it out of the totals.
            total_coffees -= 1
            continue

        relative_url = heading.a['href']
        product_url = 'https://bluebottlecoffee.com' + relative_url
        logging.info("Getting url: {}".format(relative_url))
        product_page = requests.get(product_url)
        coffee_soup = BeautifulSoup(product_page.content, "html.parser")

        price_tag = coffee_soup.find('span', {'class': 'js-variant-price'})
        price = float(price_tag.string)
        overview_tag = coffee_soup.find('p', {'class': 'spec-overview'})
        description = overview_tag.string
        # Tasting notes are read from the first <p> of the page.
        notes = coffee_soup.p.string.lower().split(',')

        # The name-based lookup only works for coffees that are not single
        # origin; the spec details, when usable, take precedence.
        region = country_from_name(name)
        try:
            details_tag = coffee_soup.find('p', {'class': 'spec-details'})
            details = details_tag.contents[0].strip()
            if country_from_name(details) != '':
                region = details
        except AttributeError:
            # An espresso is allowed to have no region at all.
            if 'Espresso' in name:
                region = ""

        size_select = coffee_soup.find('select', {'id': 'cart_item_model_id'})
        size = size_select.option.string.split('Bag')[0]

        image_url = coffee_soup.img['src']
        image_content = requests.get(image_url).content

        coffee_data = {
            'name': name,
            'roaster': roaster,
            'description': description,
            'price': price,
            'notes': notes,
            'region': region,
            'active': True,
            'product_page': product_url,
            'size': size,
            'image': image_content,
        }
        coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(
            coffee_data, coffees_updated, coffees_entered, error_coffees)

    logging.info('Blue Bottle New Results:{} / {}'.format(coffees_entered, total_coffees))
    logging.info('Blue Bottle Updated Results:{} / {}'.format(coffees_updated, total_coffees))
    if error_coffees:
        logging.warning('Blue Bottle Error coffees are: {}'.format(error_coffees))
# Example #2
0
def scrape_stumptown():
    """Scrape the Stumptown coffee listing and store every coffee found.

    Every ``a.product-grid._link`` anchor on the listing page is treated as
    one product.  Products whose URL contains ``trio`` are dropped from the
    total; every other product page is fetched, parsed, and handed to
    ``add_or_update_coffee``.  Price, notes and size are best-effort: when
    they cannot be parsed the empty-string fallback is stored instead.  A
    summary of new/updated counts is logged at the end.
    """
    roaster = 'Stumptown'
    stumptown = 'https://www.stumptowncoffee.com/coffee'
    r = requests.get(stumptown)
    soup = BeautifulSoup(r.content, "html.parser")
    coffees_for_sale = soup.find_all('a', {'class': 'product-grid _link'})
    # keeping track of how many coffees
    total_coffees = len(coffees_for_sale)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []
    for items in coffees_for_sale:
        url = items['href']
        if 'trio' in url:
            # Multi-coffee bundles are not tracked.
            total_coffees -= 1
            continue

        # Fallbacks kept when the corresponding field cannot be parsed.
        price = ""
        notes = ""
        size = ""

        product_url = 'https://www.stumptowncoffee.com' + url
        logging.info("Getting url: {}".format(url))
        r = requests.get(product_url)
        coffee_soup = BeautifulSoup(r.content, "html.parser")

        name = coffee_soup.h1.string.strip()
        try:
            price = float(coffee_soup.find_all('span', {'class': 'js-pdp-price'})[0].string)
        except IndexError as e:
            # No price element on the page.
            logging.warning("Error while getting price for {} : {}".format(name, e))
        description = coffee_soup.find('div', {'class': 'product _description'}).p.string
        try:
            notes = coffee_soup.h3.string.replace('&', ',').lower().split(',')
        except AttributeError:
            # no notes found
            pass
        region = country_from_name(name)
        # An <h6> on the product page marks the coffee as sold out.
        active = not coffee_soup.h6
        # size in ounces: first number in the second paragraph of the specs
        try:
            spec_paragraphs = coffee_soup.find('div', {'class': 'product _specs'}).find_all('p')
            size = '{} oz'.format(re.findall(r'\d+', spec_paragraphs[1].string)[0])
        except Exception as e:
            logging.warning("Error while getting size for {} : {}".format(name, e))
        image_url = coffee_soup.select('div.product._image')[0].find('span')['data-src']
        image_content = requests.get(image_url).content
        coffee_data = {
            'name': name,
            'roaster': roaster,
            'description': description,
            'price': price,
            'notes': notes,
            'region': region,
            'active': active,
            'product_page': product_url,
            'size': size,
            'image': image_content,
        }
        coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(
            coffee_data, coffees_updated, coffees_entered, error_coffees)

    logging.info('Stumptown New Results:{} / {}'.format(coffees_entered, total_coffees))
    logging.info('Stumptown Updated Results:{} / {}'.format(coffees_updated, total_coffees))
    if error_coffees:
        logging.warning('Stumptown Error coffees are: {}'.format(error_coffees))
# Example #3
0
def scrape_heart():
    """Scrape the Heart beans collection and store every coffee found.

    Product links are collected from the ``a.grid__image`` anchors of the
    collection page, leaving out any whose image alt text mentions
    ``Subscription``.  Each product page is then fetched, parsed, and handed
    to ``add_or_update_coffee``.  A summary of new/updated counts is logged
    at the end.
    """
    roaster = 'Heart'
    heart_beans = 'http://www.heartroasters.com/collections/beans'
    heart_url = 'http://www.heartroasters.com'

    listing_page = requests.get(heart_beans)
    listing_soup = BeautifulSoup(listing_page.content, "html.parser")
    all_coffee_links = [
        "{}{}".format(heart_url, anchor['href'])
        for anchor in listing_soup.find_all('a', {'class': 'grid__image'})
        if 'Subscription' not in anchor.find('img')['alt']
    ]

    total_coffees = len(all_coffee_links)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []

    for url in all_coffee_links:
        logging.info("Getting url: {}".format(url))
        product_page = requests.get(url)
        coffee_soup = BeautifulSoup(product_page.content, "html.parser")

        name = coffee_soup.h1.text.strip()
        is_blend = 'blend' in name.lower()

        # The first <option> reads "<size> - <price>" or mentions "Sold Out".
        size_price = coffee_soup.find('option').text
        size_price_parts = size_price.split(" - ")
        size = size_price_parts[0]
        sold_out = 'Sold Out' in size_price
        if sold_out:
            price = 0
        else:
            price = float(size_price_parts[1].replace('USD', '').replace('$', ''))

        tab_container = coffee_soup.find('div', {'class': 'tab-content small'})
        first_tab = tab_container.find('div', {'id': 'tab1'})
        description = first_tab.text.encode('utf-8').strip()

        flavors_tag = coffee_soup.find('p', {'class': 'small uppercase flavors'})
        notes = flavors_tag.text.split(',')

        # Blends keep an empty region; everything else is looked up by name.
        region = "" if is_blend else country_from_name(name)

        first_slide_image = coffee_soup.select('div.slide')[0].find('img')
        image_url = "http:{}".format(first_slide_image['src'])
        image_content = requests.get(image_url).content

        coffee_data = {
            'name': name,
            'roaster': roaster,
            'description': description,
            'price': price,
            'notes': notes,
            'region': region,
            'active': not sold_out,
            'product_page': url,
            'size': size,
            'image': image_content,
        }
        coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(
            coffee_data, coffees_updated, coffees_entered, error_coffees)

    logging.info('Heart New Results:{} / {}'.format(coffees_entered, total_coffees))
    logging.info('Heart Updated Results:{} / {}'.format(coffees_updated, total_coffees))
    if error_coffees:
        logging.warning('Heart Error coffees are: {}'.format(error_coffees))
# Example #4
0
def scrape_stumptown():
    """Scrape the Stumptown coffee listing and store every coffee found.

    Every ``a.product-grid._link`` anchor on the listing page is treated as
    one product.  Products whose URL contains the ignored word are dropped
    from the total; every other product page is fetched, parsed, and handed
    to ``add_or_update_coffee``.  Notes and size are best-effort: when they
    cannot be parsed the empty-string fallback is stored instead.  A summary
    of new/updated counts is logged at the end.
    """
    roaster = 'Stumptown'
    stumptown = 'https://www.stumptowncoffee.com/coffee'
    base_url = 'https://www.stumptowncoffee.com'

    r = requests.get(stumptown)
    soup = BeautifulSoup(r.content, "html.parser")
    coffees_for_sale = soup.select('a.product-grid._link')
    total_coffees = len(coffees_for_sale)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []
    ignored = "trio"
    for items in coffees_for_sale:
        url = items['href']
        if ignored in url:
            # Multi-coffee bundles are not tracked.
            total_coffees -= 1
            continue

        # Fallbacks kept when the corresponding field cannot be parsed.
        notes = ""
        size = ""

        product_url = base_url + url
        logging.info("Getting url: {}".format(url))
        r = requests.get(product_url)
        coffee_soup = BeautifulSoup(r.content, "html.parser")

        name = coffee_soup.h1.string.strip()
        price = float(coffee_soup.select_one('span.js-pdp-price').text)
        description = coffee_soup.select_one('div.product._description').p.text
        try:
            notes = coffee_soup.h3.string.replace('&', ',').lower().split(',')
        except AttributeError:  # no notes found
            pass
        region = country_from_name(name)
        # An <h6> on the product page marks the coffee as sold out.
        active = not coffee_soup.h6
        # Size in ounces: first number in the second paragraph of the specs.
        try:
            spec_paragraphs = coffee_soup.select_one(
                'div.product._specs').find_all('p')
            size = '{} oz'.format(
                re.findall(r'\d+', spec_paragraphs[1].string)[0])
        except Exception as e:
            logging.warning("Error while getting size for {} : {}".format(
                name, e))
        image_url = coffee_soup.select_one(
            'div.product._image span')['data-src']
        image_content = requests.get(image_url).content
        coffee_data = {
            'name': name,
            'roaster': roaster,
            'description': description,
            'price': price,
            'notes': notes,
            'region': region,
            'active': active,
            'product_page': product_url,
            'size': size,
            'image': image_content
        }
        coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(
            coffee_data, coffees_updated, coffees_entered, error_coffees)

    logging.info('Stumptown New Results:{} / {}'.format(
        coffees_entered, total_coffees))
    logging.info('Stumptown Updated Results:{} / {}'.format(
        coffees_updated, total_coffees))
    if error_coffees:
        logging.warning(
            'Stumptown Error coffees are: {}'.format(error_coffees))
# Example #5
0
def scrape_heart():
    """Scrape the Heart beans collection and store every coffee found.

    Product links are collected from the ``a.grid__image`` anchors of the
    collection page, leaving out any whose image alt text mentions
    ``Subscription``.  Every request to the shop is sent with an explicit
    ``Host`` header.  Each product page is fetched, parsed, and handed to
    ``add_or_update_coffee``; a summary of new/updated counts is logged at
    the end.
    """
    roaster = 'Heart'
    heart_beans = 'https://heartcoffee.myshopify.com/collections/beans'
    heart_url = 'https://heartcoffee.myshopify.com'
    host = 'heartcoffee.myshopify.com'

    collection_page = requests.get(heart_beans, headers={"Host": host})
    collection_soup = BeautifulSoup(collection_page.content, "html.parser")
    all_coffee_links = [
        "{}{}".format(heart_url, anchor['href'])
        for anchor in collection_soup.find_all('a', {'class': 'grid__image'})
        if 'Subscription' not in anchor.find('img')['alt']
    ]

    total_coffees = len(all_coffee_links)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []

    for url in all_coffee_links:
        logging.info("Getting url: {}".format(url))
        product_page = requests.get(url, headers={"Host": host})
        coffee_soup = BeautifulSoup(product_page.content, "html.parser")

        name = coffee_soup.h1.text.strip()
        is_blend = 'blend' in name.lower()

        # The first <option> reads "<size> - <price>" or mentions "Sold Out".
        size_price = coffee_soup.find('option').text
        option_fields = size_price.split(" - ")
        size = option_fields[0]
        if 'Sold Out' in size_price:
            active = False
            price = 0
        else:
            active = True
            amount_text = option_fields[1].replace('USD', '').replace('$', '')
            price = float(amount_text)

        tabs = coffee_soup.find('div', {'class': 'tab-content small'})
        description = tabs.find('div', {'id': 'tab1'}).text.encode('utf-8').strip()

        flavors = coffee_soup.find('p', {'class': 'small uppercase flavors'})
        notes = flavors.text.split(',')

        # Blends keep an empty region; everything else is looked up by name.
        if is_blend:
            region = ""
        else:
            region = country_from_name(name)

        slide_image = coffee_soup.select('div.slide')[0].find('img')
        image_url = "http:{}".format(slide_image['src'])
        image_content = requests.get(image_url).content

        coffee_data = {
            'name': name,
            'roaster': roaster,
            'description': description,
            'price': price,
            'notes': notes,
            'region': region,
            'active': active,
            'product_page': url,
            'size': size,
            'image': image_content
        }
        coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(
            coffee_data, coffees_updated, coffees_entered, error_coffees)

    logging.info('Heart New Results:{} / {}'.format(coffees_entered,
                                                    total_coffees))
    logging.info('Heart Updated Results:{} / {}'.format(
        coffees_updated, total_coffees))
    if error_coffees:
        logging.warning('Heart Error coffees are: {}'.format(error_coffees))
def scrape_bluebottle():
    """Scrape the Blue Bottle store listing and store every coffee found.

    Each ``h2.ma0`` heading on the listing page is treated as one product.
    Products whose name contains one of the ignored words are dropped from
    the total; every other product page is fetched, parsed, and handed to
    ``add_or_update_coffee``.  A product page without a size block is
    skipped with a warning (it stays in the total but is neither entered nor
    updated).  A summary of new/updated counts is logged at the end.
    """
    roaster = 'Blue Bottle'
    bluebottle = 'https://bluebottlecoffee.com/store/coffee'
    r = requests.get(bluebottle)
    # Record the listing response through logging rather than stdout.
    logging.info("Blue Bottle listing response: {}".format(r))
    soup = BeautifulSoup(r.content, "html.parser")
    coffees_for_sale = soup.select('h2.ma0')
    total_coffees = len(coffees_for_sale)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []
    ignored = ['Box', 'Kit', 'Subscriptions', 'at Home', 'Pack']
    for item in coffees_for_sale:
        name = item.string
        if any(word in name for word in ignored):
            # Not a bag of coffee: leave it out of the totals.
            total_coffees -= 1
            continue

        url = item.a['href']
        product_url = 'https://bluebottlecoffee.com' + url
        logging.info("Getting url: {}".format(url))
        r = requests.get(product_url)
        coffee_soup = BeautifulSoup(r.content, "html.parser")
        active = True
        price = float(
            coffee_soup.find('span', {
                'class': 'js-variant-price'
            }).string)
        # Prefer the spec details as description; fall back to the overview
        # when the page has no spec-details paragraph.
        try:
            description = coffee_soup.find('p', {
                'class': 'spec-details'
            }).string
        except AttributeError:
            description = coffee_soup.find('p', {
                'class': 'spec-overview'
            }).string
        notes = coffee_soup.select('div.mb30')[0].string.split(',')
        # only works for not single origin
        region = country_from_name(name)
        try:
            details = coffee_soup.find('p', {
                'class': 'spec-details'
            }).contents[0].strip()
            # Only short detail strings that resolve to a country are
            # accepted as the region.
            if country_from_name(details) != '' and len(details) < 10:
                region = details
        except AttributeError:
            # if it's an espresso, then it's okay to not have region
            if 'Espresso' in name:
                region = ""
        try:
            size = coffee_soup.find('div', {'class': 'grid-col-4'}).text
        except AttributeError:
            # No size block on the page: skip the product, but say so.
            logging.warning(
                "Skipping {}: no size found at {}".format(name, product_url))
            continue
        image_url = coffee_soup.img['src']
        image_content = requests.get(image_url).content
        coffee_data = {
            'name': name,
            'roaster': roaster,
            'description': description,
            'price': price,
            'notes': notes,
            'region': region,
            'active': active,
            'product_page': product_url,
            'size': size,
            'image': image_content
        }
        coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(
            coffee_data, coffees_updated, coffees_entered, error_coffees)

    logging.info('Blue Bottle New Results:{} / {}'.format(
        coffees_entered, total_coffees))
    logging.info('Blue Bottle Updated Results:{} / {}'.format(
        coffees_updated, total_coffees))
    if error_coffees:
        logging.warning(
            'Blue Bottle Error coffees are: {}'.format(error_coffees))