def fetch():
    """Scrape the Coop discounts index page and store each deal via db.insert().

    Aborts early when the connection fails, the HTML cannot be parsed, or
    more than settings.maxFailedDiscounts discounts miss too much info.
    Logs a per-run summary with counts and elapsed time.
    """
    LogI("Fetching Coop discounts...")
    start_time = time.time() * 1000  # wall-clock start, in milliseconds
    root_url = 'http://www.coop.nl'
    index_url = root_url + '/aanbiedingen'

    try:
        response = requests.get(index_url, headers=settings.headers)
    except requests.exceptions.ConnectionError as ce:
        LogE("Failed to connect to '{0}'".format(index_url), "{0}".format(ce))
        return

    try:
        soup = bs4.BeautifulSoup(response.text)
        soup.encode('utf-8')
    except Exception:  # narrowed from bare except: don't swallow SystemExit/KeyboardInterrupt
        LogE("Unable to parse HTML", "{0}".format(sys.exc_info()[0]))
        return

    count = 0
    failedcount = 0
    totalexceptions = 0

    # The promotion period is page-level, not per-deal; keep the long selector
    # in one place instead of duplicating it.
    duration_selector = ('div#ctl00_ctl00_ContentPlaceHolderMain_cpLeftAndContent_'
                         'Header2_divTextLink div.periode strong')

    category_divs = soup.findAll('div', {'class': 'deal'})
    for div in category_divs:
        exceptioncount = 0
        temp_data = models.defaultModel.copy()
        temp_data['supermarket'] = 'coop'
        temp_data['url'] = index_url

        # PRODUCTNAME
        try:
            temp_data['productname'] = div.find('h3').get_text()
        except Exception:
            LogE("[IGNORING] Productname not found",
                 "{0}".format(sys.exc_info()[0]))
            exceptioncount += 1

        # DURATION: "<start> t/m <end>"
        try:
            periods = soup.select(duration_selector)
            temp_data['duration'] = (periods[0].get_text() + " t/m "
                                     + periods[1].get_text())
        except IndexError as e:
            LogE("[IGNORING] Duration not found", "{0}".format(e))
            exceptioncount += 1

        # IMAGE: 'src' is site-relative, prefix with the host
        try:
            temp_data['image'] = root_url + div.find('img').get('src')
        except Exception:
            LogE("[IGNORING] Image not found", "{0}".format(sys.exc_info()[0]))
            exceptioncount += 1

        # DESCRIPTION: the Coop index page carries none; a literal assignment
        # cannot raise, so the original try/except was removed.
        temp_data['description'] = ''

        # AMOUNT: comma-join all list items of the deal-info block
        try:
            temp_data['amount'] = ", ".join(
                str(info.get_text()) for info in div.select('div.deal-info ul li'))
        except IndexError as e:
            LogE("[IGNORING] Amount not found", "{0}".format(e))
            exceptioncount += 1

        # ACTION PRICE: explicit euro price first, then known promo shields
        tempActPrice = div.select('span.deal-euros')
        if tempActPrice:
            temp_data['action_price'] = tempActPrice[0].get_text()
        elif div.select('div.i50procentkorting'):
            temp_data['action_price'] = "50% korting"
        elif div.select('div.i25procentkorting'):
            temp_data['action_price'] = "25% korting"
        elif div.select('div.i2halen1betalen'):
            temp_data['action_price'] = "2 halen, 1 betalen"
        else:
            LogE("[IGNORING] Action price not found",
                 "{0}".format("Skipped all possible options"))
            exceptioncount += 1

        count += 1
        totalexceptions += exceptioncount
        if exceptioncount > settings.maxErrors:
            LogE("Too much missing info, skipping this discount",
                 "{0} Errors occured".format(exceptioncount))
            failedcount += 1
        else:
            db.insert(temp_data)
            LogD("[{0}] ({1}) Fetched '{2}'".format(exceptioncount, count,
                                                    temp_data['productname']))

        if failedcount > settings.maxFailedDiscounts:
            LogE("Skipping this supermarket, too much missing info.",
                 "More than {0} discounts missing too much info".format(
                     settings.maxFailedDiscounts))
            LogI("Skipping this supermarket, too much missing info")
            LogI("More than {0} discounts missing too much info".format(
                settings.maxFailedDiscounts))
            return

    seconds = (time.time() * 1000) - start_time
    LogI("Done fetching {0} Coop discounts in {1}ms. {2} errors occured and ignored.\n"
         .format(count, format(seconds, '.2f'), totalexceptions))
def get_actie_data(actie_page_url):
    """Scrape one Deka 'actie' page and store every discount found on it.

    Updates the module-level counters ``count``, ``failedcount`` and
    ``totalexceptions``; returns early on connection/parse failure or once
    too many discounts miss too much info.
    """
    try:
        response = requests.get(actie_page_url, headers=settings.headers)
    except requests.exceptions.ConnectionError as ce:
        # BUGFIX: the original formatted the undefined name 'index_url' here,
        # which raised NameError whenever the connection failed.
        LogE("Failed to connect to '{0}'".format(actie_page_url),
             "{0}".format(ce))
        return

    try:
        soup = bs4.BeautifulSoup(response.text)
        soup.encode('utf-8')
    except Exception:  # narrowed from bare except
        LogE("Unable to parse HTML", "{0}".format(sys.exc_info()[0]))
        return

    global count
    global failedcount
    global totalexceptions
    global duration

    category_divs = soup.findAll('div', {'class': 'aanbieding'})
    for div in category_divs:
        exceptioncount = 0
        temp_data = models.defaultModel.copy()
        temp_data['supermarket'] = 'deka'
        temp_data['url'] = actie_page_url

        # PRODUCTNAME
        try:
            temp_data['productname'] = div.find('h2').get_text()
        except Exception:
            LogE("[IGNORING] Productname not found",
                 "{0}".format(sys.exc_info()[0]))
            exceptioncount += 1

        # DURATION: not available on the actie page; a literal assignment
        # cannot raise, so the original try/except was removed.
        temp_data['duration'] = ''

        # DESCRIPTION
        try:
            temp_data['description'] = div.select('div.text')[0].get_text()
        except IndexError as e:
            LogE("[IGNORING] Description not found", "{0}".format(e))
            exceptioncount += 1

        # IMAGE (root_url is a module-level value defined elsewhere in this file)
        try:
            temp_data['image'] = root_url + div.find('img').get('src')
        except Exception:
            LogE("[IGNORING] Image not found", "{0}".format(sys.exc_info()[0]))
            exceptioncount += 1

        # AMOUNT: decoded from the numeric suffix of the 'tagNN' CSS class
        try:
            tag_class = div.find('div', {'class': re.compile("tag")})
            temp_data['amount'] = getAmount(
                tag_class.get('class')[1].replace('tag', ''))
        except Exception:
            LogE("[IGNORING] Amount not found", "{0}".format(sys.exc_info()[0]))
            exceptioncount += 1

        # ACTION PRICE: whole part + fractional part
        try:
            temp_data['action_price'] = (
                div.select('span.current span.whole')[0].get_text()
                + div.select('span.current span.part')[0].get_text())
        except IndexError as e:
            LogE("[IGNORING] Action price not found", "{0}".format(e))
            exceptioncount += 1

        # OLD PRICE: same whole/part composition as the action price
        try:
            temp_data['old_price'] = (
                div.select('span.old span.whole')[0].get_text()
                + div.select('span.old span.part')[0].get_text())
        except IndexError as e:
            LogE("[IGNORING] Old price not found", "{0}".format(e))
            exceptioncount += 1

        count += 1
        totalexceptions += exceptioncount
        if exceptioncount > settings.maxErrors:
            LogE("Too much missing info, skipping this discount",
                 "{0} Errors occured".format(exceptioncount))
            failedcount += 1
        else:
            db.insert(temp_data)
            LogD("[{0}] ({1}) Fetched '{2}'".format(exceptioncount, count,
                                                    temp_data['productname']))

        if failedcount > settings.maxFailedDiscounts:
            LogE("Skipping this supermarket, too much missing info.",
                 "More than {0} discounts missing too much info".format(
                     settings.maxFailedDiscounts))
            LogI("Skipping this supermarket, too much missing info")
            LogI("More than {0} discounts missing too much info".format(
                settings.maxFailedDiscounts))
            return
def fetch():
    """Scrape the AH bonus page and store each bonus product via db.insert().

    Aborts early on connection/parse failure or when more than
    settings.maxFailedDiscounts discounts miss too much info.
    """
    LogI("Fetching AH discounts...")
    start_time = time.time() * 1000  # wall-clock start, in milliseconds
    index_url = 'http://www.ah.nl/bonus'

    try:
        r = requests.get(index_url, headers=settings.headers)
    except requests.exceptions.ConnectionError as ce:
        LogE("Failed to connect to '{0}'".format(index_url), "{0}".format(ce))
        return

    try:
        soup = bs4.BeautifulSoup(r.text, 'html5lib')
        soup.encode('utf-8')
    except Exception:  # narrowed from bare except
        LogE("Unable to parse HTML", "{0}".format(sys.exc_info()[0]))
        return

    count = 0
    failedcount = 0
    totalexceptions = 0

    bonus_products = soup.findAll(attrs={'data-class': 'product'})
    for bonus in bonus_products:
        exceptioncount = 0
        superdata = models.defaultModel.copy()
        superdata['supermarket'] = 'ah'

        # URL: two candidate sources. The second overwrites the first when
        # both succeed; if the later lookup raises, the earlier value (if any)
        # survives and the error is only reported when nothing was found.
        try:
            superdata['url'] = "http://www.ah.nl" + bonus.select('div.detail a')[0].get('href')
            superdata['url'] = "http://www.ah.nl" + bonus.get('href')
        except (IndexError, TypeError) as e:
            if superdata['url'] is None:
                LogE("[IGNORING] Error", "{0}".format(e))
                exceptioncount += 1

        # PRODUCTNAME
        try:
            superdata['productname'] = bonus.select('div.detail h2')[0].get_text().strip()
        except IndexError as e:
            LogE("[IGNORING] Productname not found", "{0}".format(e))
            exceptioncount += 1

        # DURATION: the page-level selector is unreliable, so a fixed label
        # is used instead (see the commented-out selector).
        #superdata['duration'] = soup.select('div.columns p.header-bar__term')[0].get_text()
        superdata['duration'] = 'This week'

        # IMAGE: lazy-loaded, real URL lives in 'data-original'
        try:
            superdata['image'] = bonus.select('div.image img')[0].get('data-original')
        except IndexError as e:
            LogE("[IGNORING] Image not found", "{0}".format(e))
            exceptioncount += 1

        # AMOUNT
        try:
            tempAmount = bonus.select('div.image p.unit')[0].get_text().strip()
            if tempAmount is not None:
                superdata['amount'] = tempAmount
        except IndexError as e:
            LogE("[IGNORING] Amount not found", "{0}".format(e))
            exceptioncount += 1

        # BONUS shield text
        try:
            superdata['bonus'] = bonus.select('div.shield')[0].get_text().strip()
        except IndexError as e:
            LogE("[IGNORING] Bonus not found", "{0}".format(e))
            exceptioncount += 1

        # ACTION PRICE
        try:
            superdata['action_price'] = bonus.select('p.price ins')[0].get_text()
        except IndexError as e:
            LogE("[IGNORING] Action price not found", "{0}".format(e))
            exceptioncount += 1

        # OLD PRICE
        try:
            superdata['old_price'] = bonus.select('p.price del')[0].get_text()
        except IndexError as e:
            LogE("[IGNORING] Old price not found", "{0}".format(e))
            exceptioncount += 1

        count += 1
        totalexceptions += exceptioncount
        if exceptioncount > settings.maxErrors:
            LogE("Too much missing info, skipping this discount",
                 "{0} Errors occured".format(exceptioncount))
            failedcount += 1
        else:
            db.insert(superdata)
            LogD("[{0}] ({1}) Fetched '{2}'".format(exceptioncount, count,
                                                    superdata['productname']))

        if failedcount > settings.maxFailedDiscounts:
            LogE("Skipping this supermarket, too much missing info.",
                 "More than {0} discounts missing too much info".format(settings.maxFailedDiscounts))
            LogI("Skipping this supermarket, too much missing info")
            LogI("More than {0} discounts missing too much info".format(settings.maxFailedDiscounts))
            return

    seconds = (time.time() * 1000) - start_time
    LogI("Done fetching {0} AH discounts in {1}ms. {2} errors occured and ignored.\n"
         .format(count, format(seconds, '.2f'), totalexceptions))
def fetch():
    """Scrape the Coop discounts index page and store each deal via db.insert().

    Aborts early on connection/parse failure or when more than
    settings.maxFailedDiscounts discounts miss too much info.
    """
    LogI("Fetching Coop discounts...")
    start_time = time.time() * 1000  # wall-clock start, in milliseconds
    root_url = 'http://www.coop.nl'
    index_url = root_url + '/aanbiedingen'

    try:
        response = requests.get(index_url, headers=settings.headers)
    except requests.exceptions.ConnectionError as ce:
        LogE("Failed to connect to '{0}'".format(index_url), "{0}".format(ce))
        return

    try:
        soup = bs4.BeautifulSoup(response.text)
        soup.encode('utf-8')
    except Exception:  # narrowed from bare except
        LogE("Unable to parse HTML", "{0}".format(sys.exc_info()[0]))
        return

    count = 0
    failedcount = 0
    totalexceptions = 0

    # Page-level promotion period; keep the long selector in one place.
    duration_selector = ('div#ctl00_ctl00_ContentPlaceHolderMain_cpLeftAndContent_'
                         'Header2_divTextLink div.periode strong')

    category_divs = soup.findAll('div', {'class': 'deal'})
    for div in category_divs:
        exceptioncount = 0
        temp_data = models.defaultModel.copy()
        temp_data['supermarket'] = 'coop'
        temp_data['url'] = index_url

        # PRODUCTNAME
        try:
            temp_data['productname'] = div.find('h3').get_text()
        except Exception:
            LogE("[IGNORING] Productname not found",
                 "{0}".format(sys.exc_info()[0]))
            exceptioncount += 1

        # DURATION: "<start> t/m <end>"
        try:
            periods = soup.select(duration_selector)
            temp_data['duration'] = (periods[0].get_text() + " t/m "
                                     + periods[1].get_text())
        except IndexError as e:
            LogE("[IGNORING] Duration not found", "{0}".format(e))
            exceptioncount += 1

        # IMAGE: 'src' is site-relative
        try:
            temp_data['image'] = root_url + div.find('img').get('src')
        except Exception:
            LogE("[IGNORING] Image not found", "{0}".format(sys.exc_info()[0]))
            exceptioncount += 1

        # DESCRIPTION: none on the index page; literal assignment cannot raise,
        # so the original try/except was removed.
        temp_data['description'] = ''

        # AMOUNT
        try:
            temp_data['amount'] = ", ".join(
                str(info.get_text()) for info in div.select('div.deal-info ul li'))
        except IndexError as e:
            LogE("[IGNORING] Amount not found", "{0}".format(e))
            exceptioncount += 1

        # ACTION PRICE: euro price first, then known promo shields
        tempActPrice = div.select('span.deal-euros')
        if tempActPrice:
            temp_data['action_price'] = tempActPrice[0].get_text()
        elif div.select('div.i50procentkorting'):
            temp_data['action_price'] = "50% korting"
        elif div.select('div.i25procentkorting'):
            temp_data['action_price'] = "25% korting"
        elif div.select('div.i2halen1betalen'):
            temp_data['action_price'] = "2 halen, 1 betalen"
        else:
            LogE("[IGNORING] Action price not found",
                 "{0}".format("Skipped all possible options"))
            exceptioncount += 1

        count += 1
        totalexceptions += exceptioncount
        if exceptioncount > settings.maxErrors:
            LogE("Too much missing info, skipping this discount",
                 "{0} Errors occured".format(exceptioncount))
            failedcount += 1
        else:
            db.insert(temp_data)
            LogD("[{0}] ({1}) Fetched '{2}'".format(exceptioncount, count,
                                                    temp_data['productname']))

        if failedcount > settings.maxFailedDiscounts:
            LogE("Skipping this supermarket, too much missing info.",
                 "More than {0} discounts missing too much info".format(settings.maxFailedDiscounts))
            LogI("Skipping this supermarket, too much missing info")
            LogI("More than {0} discounts missing too much info".format(settings.maxFailedDiscounts))
            return

    seconds = (time.time() * 1000) - start_time
    LogI("Done fetching {0} Coop discounts in {1}ms. {2} errors occured and ignored.\n"
         .format(count, format(seconds, '.2f'), totalexceptions))
def fetch():
    """Scrape the C1000 weekly offers page and store each item via db.insert().

    Aborts early on connection/parse failure or when more than
    settings.maxFailedDiscounts discounts miss too much info.
    """
    LogI("Fetching C1000 discounts...")
    start_time = time.time() * 1000  # wall-clock start, in milliseconds
    root_url = 'http://www.c1000.nl/'
    index_url = root_url + 'aanbiedingen'

    try:
        response = requests.get(index_url, headers=settings.headers)
    except requests.exceptions.ConnectionError as ce:
        LogE("Failed to connect to '{0}'".format(index_url), "{0}".format(ce))
        return

    try:
        soup = bs4.BeautifulSoup(response.text)
        soup.encode('utf-8')
    except Exception:  # narrowed from bare except
        LogE("Unable to parse HTML", "{0}".format(sys.exc_info()[0]))
        return

    count = 0
    failedcount = 0
    totalexceptions = 0

    category_divs = soup.findAll(
        'div',
        id=re.compile(
            "^content_0_contentrij1_0_weekAanbiedingen_listViewCategorieen_"))
    for div in category_divs:
        for li in div.findAll('li'):
            exceptioncount = 0
            temp_data = models.defaultModel.copy()
            temp_data['supermarket'] = 'c1000'

            # URL
            temp_data['url'] = index_url

            # PRODUCTNAME
            try:
                temp_data['productname'] = li.find('h2').get_text().strip()
            except Exception:
                LogE("[IGNORING] Productname not found",
                     "{0}".format(sys.exc_info()[0]))
                exceptioncount += 1

            # DURATION: strip tabs/newlines from the current-week tab label
            try:
                temp_data['duration'] = re.sub(
                    r'[\t\r\n]', '',
                    soup.find('a', {
                        'id': 'content_0_contentrij1_0_linkTabHuidigeWeek'
                    }).get_text()).strip().replace('  ', ' ')
            except Exception:
                # BUGFIX: the original logged "Productname not found" here.
                LogE("[IGNORING] Duration not found",
                     "{0}".format(sys.exc_info()[0]))
                exceptioncount += 1

            # DESCRIPTION
            try:
                temp_data['description'] = re.sub(
                    r'[\t\r\n]', '',
                    li.select('div.product_details p')[0].get_text().strip())
            except IndexError as e:
                LogE("[IGNORING] Description not found", "{0}".format(e))
                exceptioncount += 1

            # IMAGE: 'src' is site-relative
            try:
                temp_data['image'] = root_url + li.find('img').get('src')
            except Exception:
                LogE("[IGNORING] Image not found",
                     "{0}".format(sys.exc_info()[0]))
                exceptioncount += 1

            # AMOUNT: the pricetag value overwrites the details text when
            # both are present; an IndexError keeps whichever was set.
            try:
                temp_data['amount'] = li.select(
                    'div.product_details p')[0].get_text().strip()
                temp_data['amount'] = li.select(
                    'div.pricetag em')[0].get_text().strip()
            except IndexError as e:
                if temp_data['amount'] is None:
                    LogE("[IGNORING] Amount not found", "{0}".format(e))
                    exceptioncount += 1

            # ACTION PRICE: pricetag text first, then the 'alt' of the
            # second visual image as fallback.
            try:
                temp_data['action_price'] = li.select(
                    'div.pricetag strong')[0].get_text().strip()
            except (IndexError, TypeError) as e:
                try:
                    temp_data['action_price'] = li.select(
                        'img.visual')[1].get('alt').strip()
                except IndexError:
                    # BUGFIX: the fallback previously rebound 'as e', which in
                    # Python 3 unbinds the outer 'e' used in the log below.
                    pass
                if temp_data['action_price'] is None:
                    LogE("[IGNORING] Action price not found", "{0}".format(e))
                    exceptioncount += 1

            # OLD PRICE
            try:
                temp_data['old_price'] = li.select('del')[0].get_text().strip()
            except IndexError as e:
                LogE("[IGNORING] Old price not found", "{0}".format(e))
                exceptioncount += 1

            count += 1
            totalexceptions += exceptioncount
            if exceptioncount > settings.maxErrors:
                LogE("Too much missing info, skipping this discount",
                     "{0} Errors occured".format(exceptioncount))
                failedcount += 1
            else:
                db.insert(temp_data)
                LogD("[{0}] ({1}) Fetched '{2}'".format(
                    exceptioncount, count, temp_data['productname']))

            if failedcount > settings.maxFailedDiscounts:
                LogE("Skipping this supermarket, too much missing info.",
                     "More than {0} discounts missing too much info".format(
                         settings.maxFailedDiscounts))
                LogI("Skipping this supermarket, too much missing info")
                LogI("More than {0} discounts missing too much info".format(
                    settings.maxFailedDiscounts))
                return

    seconds = (time.time() * 1000) - start_time
    LogI("Done fetching {0} C1000 discounts in {1}ms. {2} errors occured and ignored.\n"
         .format(count, format(seconds, '.2f'), totalexceptions))
def fetch():
    """Scrape the Poiesz offers page and store each 'meevaller' via db.insert().

    Aborts early on connection/parse failure or when more than
    settings.maxFailedDiscounts discounts miss too much info.
    """
    LogI("Fetching Poiesz discounts...")
    start_time = time.time() * 1000  # wall-clock start, in milliseconds
    index_url = 'http://www.poiesz-supermarkten.nl/aanbiedingen'

    try:
        r = requests.get(index_url, headers=settings.headers)
    except requests.exceptions.ConnectionError as ce:
        LogE("Failed to connect to '{0}'".format(index_url), "{0}".format(ce))
        return

    try:
        soup = bs4.BeautifulSoup(r.text, 'html5lib')
        soup.encode('utf-8')
    except Exception:  # narrowed from bare except
        LogE("Unable to parse HTML", "{0}".format(sys.exc_info()[0]))
        return

    count = 0
    failedcount = 0
    totalexceptions = 0

    discounts = soup.select('div.meevaller')
    for discount in discounts:
        exceptioncount = 0
        superdata = models.defaultModel.copy()
        superdata['supermarket'] = 'poiesz'

        # URL: a literal assignment cannot raise; the error branch is kept
        # for parity with the other scrapers but will never fire.
        try:
            superdata['url'] = index_url
        except (IndexError, TypeError) as e:
            if superdata['url'] is None:
                LogE("[IGNORING] Error", "{0}".format(e))
                exceptioncount += 1

        # PRODUCTNAME
        try:
            superdata['productname'] = discount.select('h2')[0].get_text().strip()
        except IndexError as e:
            LogE("[IGNORING] Productname not found", "{0}".format(e))
            exceptioncount += 1

        # DURATION is page-level ('geldig t/m ...')
        try:
            superdata['duration'] = soup.select('div.validThrough')[0].get_text().strip()
        except IndexError as e:
            LogE("[IGNORING] Duration not found", "{0}".format(e))
            exceptioncount += 1

        # IMAGE: 'src' is site-relative
        try:
            superdata['image'] = ("http://www.poiesz-supermarkten.nl/"
                                  + discount.select('img')[0].get('src'))
        except IndexError as e:
            LogE("[IGNORING] Image not found", "{0}".format(e))
            exceptioncount += 1

        # AMOUNT: collapse internal whitespace runs to single spaces
        try:
            tempAmount = discount.select('div.shieldNew div.top')[0].get_text().strip()
            tempAmount = ' '.join(tempAmount.split())
            if tempAmount is not None:
                superdata['amount'] = tempAmount
        except IndexError as e:
            LogE("[IGNORING] Amount not found", "{0}".format(e))
            exceptioncount += 1

        # DESCRIPTION: concatenate all subtitle texts
        try:
            for sub in discount.select('div.subtitle'):
                if sub.get_text() is not None:
                    superdata['description'] = (superdata['description']
                                                + sub.get_text().strip() + ' ')
        except IndexError as e:
            LogE("[IGNORING] Description not found", "{0}".format(e))
            exceptioncount += 1

        # ACTION PRICE: whole.part first, 'combined' display as fallback
        try:
            tempPrice = (discount.select('div.forPrice div.whole')[0].get_text().strip()
                         + "."
                         + discount.select('div.forPrice div.part')[0].get_text().strip())
            superdata['action_price'] = ' '.join(tempPrice.split())
        except IndexError as e:
            try:
                tempPrice = discount.select('div.forPrice div.combined')[0].get_text().strip()
                superdata['action_price'] = ' '.join(tempPrice.split())
            except IndexError as ex:
                if superdata['action_price'] is None:
                    # BUGFIX: log the fallback failure (ex); the original
                    # formatted the outer exception 'e' instead.
                    LogE("[IGNORING] Action price not found", "{0}".format(ex))
                    exceptioncount += 1

        # OLD PRICE
        try:
            tempOld = discount.select('div.fromPriceWrap')[0].get_text().strip()
            superdata['old_price'] = ' '.join(tempOld.split())
        except IndexError as e:
            LogE("[IGNORING] Old price not found", "{0}".format(e))
            exceptioncount += 1

        count += 1
        totalexceptions += exceptioncount
        if exceptioncount > settings.maxErrors:
            LogE("Too much missing info, skipping this discount",
                 "{0} Errors occured".format(exceptioncount))
            failedcount += 1
        else:
            db.insert(superdata)
            LogD("[{0}] ({1}) Fetched '{2}'".format(exceptioncount, count,
                                                    superdata['productname']))

        if failedcount > settings.maxFailedDiscounts:
            LogE("Skipping this supermarket, too much missing info.",
                 "More than {0} discounts missing too much info".format(settings.maxFailedDiscounts))
            LogI("Skipping this supermarket, too much missing info")
            LogI("More than {0} discounts missing too much info".format(settings.maxFailedDiscounts))
            return

    seconds = (time.time() * 1000) - start_time
    LogI("Done fetching {0} Poiesz discounts in {1}ms. {2} errors occured and ignored.\n"
         .format(count, format(seconds, '.2f'), totalexceptions))
def fetch():
    """Scrape the AH bonus page and store each bonus product via db.insert().

    Aborts early on connection/parse failure or when more than
    settings.maxFailedDiscounts discounts miss too much info.
    """
    LogI("Fetching AH discounts...")
    start_time = time.time() * 1000  # wall-clock start, in milliseconds
    index_url = 'http://www.ah.nl/bonus'

    try:
        r = requests.get(index_url, headers=settings.headers)
    except requests.exceptions.ConnectionError as ce:
        LogE("Failed to connect to '{0}'".format(index_url), "{0}".format(ce))
        return

    try:
        soup = bs4.BeautifulSoup(r.text, 'html5lib')
        soup.encode('utf-8')
    except Exception:  # narrowed from bare except
        LogE("Unable to parse HTML", "{0}".format(sys.exc_info()[0]))
        return

    count = 0
    failedcount = 0
    totalexceptions = 0

    bonus_products = soup.findAll(attrs={'data-class': 'product'})
    for bonus in bonus_products:
        exceptioncount = 0
        superdata = models.defaultModel.copy()
        superdata['supermarket'] = 'ah'

        # URL: two candidate sources; the second overwrites the first when
        # both succeed, and an earlier success survives a later failure.
        try:
            superdata['url'] = "http://www.ah.nl" + bonus.select(
                'div.detail a')[0].get('href')
            superdata['url'] = "http://www.ah.nl" + bonus.get('href')
        except (IndexError, TypeError) as e:
            if superdata['url'] is None:
                LogE("[IGNORING] Error", "{0}".format(e))
                exceptioncount += 1

        # PRODUCTNAME
        try:
            superdata['productname'] = bonus.select(
                'div.detail h2')[0].get_text().strip()
        except IndexError as e:
            LogE("[IGNORING] Productname not found", "{0}".format(e))
            exceptioncount += 1

        # DURATION: fixed label; the page-level selector was unreliable.
        #superdata['duration'] = soup.select('div.columns p.header-bar__term')[0].get_text()
        superdata['duration'] = 'This week'

        # IMAGE: lazy-loaded, real URL in 'data-original'
        try:
            superdata['image'] = bonus.select('div.image img')[0].get(
                'data-original')
        except IndexError as e:
            LogE("[IGNORING] Image not found", "{0}".format(e))
            exceptioncount += 1

        # AMOUNT
        try:
            tempAmount = bonus.select('div.image p.unit')[0].get_text().strip()
            if tempAmount is not None:
                superdata['amount'] = tempAmount
        except IndexError as e:
            LogE("[IGNORING] Amount not found", "{0}".format(e))
            exceptioncount += 1

        # BONUS shield text
        try:
            superdata['bonus'] = bonus.select('div.shield')[0].get_text().strip()
        except IndexError as e:
            LogE("[IGNORING] Bonus not found", "{0}".format(e))
            exceptioncount += 1

        # ACTION PRICE
        try:
            superdata['action_price'] = bonus.select('p.price ins')[0].get_text()
        except IndexError as e:
            LogE("[IGNORING] Action price not found", "{0}".format(e))
            exceptioncount += 1

        # OLD PRICE
        try:
            superdata['old_price'] = bonus.select('p.price del')[0].get_text()
        except IndexError as e:
            LogE("[IGNORING] Old price not found", "{0}".format(e))
            exceptioncount += 1

        count += 1
        totalexceptions += exceptioncount
        if exceptioncount > settings.maxErrors:
            LogE("Too much missing info, skipping this discount",
                 "{0} Errors occured".format(exceptioncount))
            failedcount += 1
        else:
            db.insert(superdata)
            LogD("[{0}] ({1}) Fetched '{2}'".format(exceptioncount, count,
                                                    superdata['productname']))

        if failedcount > settings.maxFailedDiscounts:
            LogE("Skipping this supermarket, too much missing info.",
                 "More than {0} discounts missing too much info".format(
                     settings.maxFailedDiscounts))
            LogI("Skipping this supermarket, too much missing info")
            LogI("More than {0} discounts missing too much info".format(
                settings.maxFailedDiscounts))
            return

    seconds = (time.time() * 1000) - start_time
    LogI("Done fetching {0} AH discounts in {1}ms. {2} errors occured and ignored.\n"
         .format(count, format(seconds, '.2f'), totalexceptions))
def fetch():
    """Scrape the Jan Linders weekly actions page and store each product.

    The action price can be rendered in many mutually exclusive markup
    variants; each is tried in turn. Aborts early on connection/parse
    failure or when more than settings.maxFailedDiscounts discounts miss
    too much info.
    """
    LogI("Fetching Jan Linders discounts...")
    start_time = time.time() * 1000  # wall-clock start, in milliseconds
    root_url = 'http://www.janlinders.nl'
    index_url = root_url + '/acties/weekacties/'
    count = 0
    failedcount = 0
    totalexceptions = 0

    try:
        response = requests.get(index_url, headers=settings.headers)
    except requests.exceptions.ConnectionError as ce:
        LogE("Failed to connect to '{0}'".format(index_url), "{0}".format(ce))
        return

    try:
        soup = bs4.BeautifulSoup(response.text, 'html5lib')
        soup.encode('utf-8')
    except Exception:  # narrowed from bare except
        LogE("Unable to parse HTML", "{0}".format(sys.exc_info()[0]))
        return

    # raw string: '\d' in a normal literal is a deprecated escape
    category_divs = soup.find_all('div', class_=re.compile(r'dots_\d+'))
    for div in category_divs:
        div_items = div.findAll('div', {'class': 'hover_discount_product'})
        for actdiv in div_items:
            exceptioncount = 0
            temp_data = models.defaultModel.copy()
            temp_data['supermarket'] = 'janlinders'
            temp_data['url'] = index_url

            # PRODUCTNAME: bold prefix + heading when both exist, else heading only
            if actdiv.select('div.action b'):
                temp_data['productname'] = (
                    actdiv.select('div.action b')[0].get_text() + " "
                    + actdiv.select('div.action h4')[0].get_text().replace('\n', ' '))
            else:
                try:
                    temp_data['productname'] = actdiv.select(
                        'div.action h4')[0].get_text().replace('\n', ' ')
                except IndexError as e:
                    LogE("[IGNORING] Productname not found", "{0}".format(e))
                    exceptioncount += 1

            # DURATION is page-level
            try:
                temp_data['duration'] = soup.select('div.date-small')[0].get_text()
            except IndexError as e:
                LogE("[IGNORING] Duration not found", "{0}".format(e))
                exceptioncount += 1

            # OLD PRICE
            try:
                tempOldPrice = actdiv.select('.oldprice')[0].get_text()
                # BUGFIX: the original also tested "tempOldPrice is not ''",
                # an identity comparison with a literal (never reliable and
                # redundant with the equality test).
                if tempOldPrice is not None and tempOldPrice != '':
                    temp_data['old_price'] = tempOldPrice
            except IndexError as e:
                LogE("[IGNORING] Old price not found", "{0}".format(e))
                exceptioncount += 1

            # AMOUNT: strip child elements out of the description node,
            # leaving only the free-floating amount text.
            try:
                tempamount = actdiv.find('div', {'class': re.compile(r"^description")})
                for node in tempamount.findAll('h4'):
                    node.extract()
                for node in tempamount.findAll('b'):
                    node.extract()
                for node in tempamount.findAll('span'):
                    node.extract()
                for node in tempamount.findAll('div'):
                    node.extract()
                temp_data['amount'] = tempamount.get_text().replace('\n', ' ')
                try:
                    temp_data['amount'] += ". " + actdiv.select(
                        'div.x_price_w_amount span.small')[0].get_text()
                except Exception:
                    pass  # optional suffix; best-effort
            except Exception:
                LogE("[IGNORING] Amount not found", "{0}".format(sys.exc_info()[0]))
                exceptioncount += 1

            # IMAGE: 'src' is site-relative
            try:
                temp_data['image'] = root_url + actdiv.find('img').get('src')
            except Exception:
                LogE("[IGNORING] Image not found", "{0}".format(sys.exc_info()[0]))
                exceptioncount += 1

            # ACTION PRICE: try each known markup variant in order
            if actdiv.select('div.action div.regular_price span.big') and actdiv.select(
                    'div.action div.regular_price span.small'):
                try:
                    temp_data['action_price'] = (
                        actdiv.select('div.action div.regular_price span.big')[0].get_text()
                        + "."
                        + actdiv.select('div.action div.regular_price span.small')[0].get_text())
                except Exception:
                    LogE("[IGNORING] Action price not found", "None")
                    exceptioncount += 1
            elif actdiv.select('div.x_price_w_amount div.small') and actdiv.select(
                    'div.x_price_w_amount div.big'):
                # e.g. "2 voor 3.99": small[0] amount, big+small[1] price
                try:
                    temp_data['action_price'] = (
                        actdiv.select('div.x_price_w_amount div.small')[0].get_text()
                        + " "
                        + actdiv.select('div.x_price_w_amount div.big')[0].get_text()
                        + "."
                        + actdiv.select('div.x_price_w_amount div.small')[1].get_text())
                except Exception:
                    LogE("[IGNORING] Action price not found", "None")
                    exceptioncount += 1
            elif actdiv.select('div.x_free div.big') and actdiv.select(
                    'div.x_free div.small'):
                try:
                    temp_data['action_price'] = (
                        ' '.join(actdiv.select('div.x_free div.big')[0].get_text().strip().split())
                        + " "
                        + ' '.join(actdiv.select('div.x_free div.small')[0].get_text().strip().split()))
                except Exception:
                    LogE("[IGNORING] Action price not found", "None")
                    exceptioncount += 1
            elif actdiv.select('div.regular_price div.big') and actdiv.select(
                    'div.regular_price div.small'):
                try:
                    temp_data['action_price'] = (
                        actdiv.select('div.regular_price div.big')[0].get_text()
                        + "."
                        + actdiv.select('div.regular_price div.small')[0].get_text())
                except Exception:
                    LogE("[IGNORING] Action price not found", "None")
                    exceptioncount += 1
            #elif actdiv.select('div.price div.big') and actdiv.select('div.price div.small'):
            #    temp_data['action_price'] = actdiv.select('div.price div.big')[0].get_text() + "." + actdiv.select('div.price div.small')[0].get_text()
            elif actdiv.select('div.action div.x_discount div.big') and actdiv.select(
                    'div.action div.x_discount div.small'):
                try:
                    temp_data['action_price'] = (
                        ' '.join(actdiv.select('div.action div.x_discount div.big')[0]
                                 .get_text().strip().split())
                        + " "
                        + actdiv.select('div.action div.x_discount div.small')[0].get_text())
                except Exception:
                    LogE("[IGNORING] Action price not found", "None")
                    exceptioncount += 1
            elif actdiv.select('div.x_get_for div.small'):
                try:
                    temp_data['action_price'] = (
                        ' '.join(actdiv.select('div.x_get_for div.small')[0]
                                 .get_text().strip().split())
                        + ", "
                        + actdiv.select('div.x_get_for div.small')[1].get_text().strip())
                except Exception:
                    LogE("[IGNORING] Action price not found", "None")
                    exceptioncount += 1
            elif actdiv.select('div.x_half_price div.small'):
                try:
                    temp_data['action_price'] = (
                        ' '.join(actdiv.select('div.x_half_price div.small')[0]
                                 .get_text().strip().split())
                        + " "
                        + actdiv.select('div.x_half_price div.small')[1].get_text().strip())
                except Exception:
                    LogE("[IGNORING] Action price not found", "None")
                    exceptioncount += 1
            else:
                LogE("[IGNORING] Action price not found", "None")
                exceptioncount += 1

            totalexceptions += exceptioncount
            count += 1
            if exceptioncount > settings.maxErrors:
                LogE("Too much missing info, skipping this discount",
                     "{0} Errors occured".format(exceptioncount))
                failedcount += 1
            else:
                db.insert(temp_data)
                LogD("[{0}] ({1}) Fetched '{2}'".format(
                    exceptioncount, count, temp_data['productname']))

            if failedcount > settings.maxFailedDiscounts:
                LogE("Skipping this supermarket, too much missing info.",
                     "More than {0} discounts missing too much info".format(
                         settings.maxFailedDiscounts))
                LogI("Skipping this supermarket, too much missing info")
                LogI("More than {0} discounts missing too much info".format(
                    settings.maxFailedDiscounts))
                return

    seconds = (time.time() * 1000) - start_time
    LogI("Done fetching {0} Jan Linders discounts in {1}ms. {2} errors occured and ignored.\n"
         .format(count, format(seconds, '.2f'), totalexceptions))
def get_actie_data(actie_page_url):
    """Scrape one Dirk action page and insert the discount into the db.

    Updates the module-level counters ``count``, ``failedcount`` and
    ``totalexceptions``. Individual missing fields are logged and ignored;
    the discount is skipped entirely once more than ``settings.maxErrors``
    fields are missing.
    """
    global count
    global failedcount
    global totalexceptions

    exceptioncount = 0
    actie_data = models.defaultModel.copy()
    actie_data['supermarket'] = 'dirk'

    url = root_url + actie_page_url
    try:
        response = requests.get(url, headers=settings.headers)
    except requests.exceptions.ConnectionError as ce:
        # BUG FIX: previously logged the undefined name `index_url`,
        # which raised a NameError instead of reporting the failure.
        LogE("Failed to connect to '{0}'".format(url), "{0}".format(ce))
        return
    try:
        soup = bs4.BeautifulSoup(response.text)
        soup.encode('utf-8')
    except Exception:  # narrowed from bare except
        LogE("Unable to parse HTML", "{0}".format(sys.exc_info()[0]))
        return

    actie_data['url'] = root_url + actie_page_url

    # PRODUCTNAME (soup.find returns None when absent -> AttributeError)
    try:
        actie_data['productname'] = soup.find('h2').get_text()
    except Exception:
        LogE("[IGNORING] Productname not found",
             "{0}".format(sys.exc_info()[0]))
        exceptioncount += 1
    # DURATION
    try:
        actie_data['duration'] = soup.select(
            'div.fromTill')[0].get_text().strip()
    except IndexError as e:
        LogE("[IGNORING] Duration not found", "{0}".format(e))
        exceptioncount += 1
    # AMOUNT -- already stripped, so any non-empty string is meaningful
    try:
        amount = soup.select('div.subtitle')[0].get_text().strip()
        if amount:
            actie_data['amount'] = amount
    except IndexError as e:
        LogE("[IGNORING] Amount not found", "{0}".format(e))
        exceptioncount += 1
    # IMAGE -- extracted from the inline CSS background-image: url(...)
    try:
        div_style = soup.find('div', {'class': 'image'})['style']
        style = cssutils.parseStyle(div_style)
        url = style['background-image']
        url = url.replace('url(', '').replace(')', '')
        actie_data['image'] = root_url + url
    except Exception:
        LogE("[IGNORING] Image not found", "{0}".format(sys.exc_info()[0]))
        exceptioncount += 1
    # ACTION PRICE -- title attribute like "\u20ac 1,99" -> "1.99"
    try:
        actie_data['action_price'] = soup.select('div.star')[0].get(
            'title').strip().replace(u"\u20AC ", "").replace(",", ".")
    except IndexError as e:
        LogE("[IGNORING] Action price not found", "{0}".format(e))
        exceptioncount += 1
    # OLD PRICE
    try:
        actie_data['old_price'] = soup.select('span.stripe')[0].get_text()
    except IndexError as e:
        LogE("[IGNORING] Old price not found", "{0}".format(e))
        exceptioncount += 1

    totalexceptions += exceptioncount
    count += 1
    if exceptioncount > settings.maxErrors:
        LogE("Too much missing info, skipping this discount",
             "{0} Errors occured".format(exceptioncount))
        failedcount += 1
    else:
        db.insert(actie_data)
        LogD("[{0}] ({1}) Fetched '{2}'".format(
            exceptioncount, count, actie_data['productname']))
    if failedcount > settings.maxFailedDiscounts:
        LogE("Skipping this supermarket, too much missing info.",
             "More than {0} discounts missing too much info".format(
                 settings.maxFailedDiscounts))
        LogI("Skipping this supermarket, too much missing info")
        LogI("More than {0} discounts missing too much info".format(
            settings.maxFailedDiscounts))
        return
def fetch():
    """Fetch all Aldi discounts and insert them into the db.

    Walks the category pages listed in the site menu, then scrapes each
    product tile on every page. Missing fields are logged and ignored;
    a discount missing more than ``settings.maxErrors`` fields is skipped,
    and the whole supermarket is abandoned after
    ``settings.maxFailedDiscounts`` skipped discounts.
    """
    LogI("Fetching Aldi discounts...")
    start_time = time.time() * 1000
    index_url = 'http://www.aldi.nl/'

    try:
        r = requests.get(index_url, headers=settings.headers)
    except requests.exceptions.ConnectionError as ce:
        LogE("Failed to connect to '{0}'".format(index_url),
             "{0}".format(ce))
        return
    try:
        soup = bs4.BeautifulSoup(r.text, 'html5lib')
        soup.encode('utf-8')
    except Exception:  # narrowed from bare except
        LogE("Unable to parse HTML", "{0}".format(sys.exc_info()[0]))
        return

    count = 0
    failedcount = 0
    totalexceptions = 0

    # Category links live in the first sub-menu of menu 142002.
    pages = soup.select('ul#ul_menu_142002 li')[0]
    pages = pages.select('ul li a')
    for page in pages:
        try:
            r = requests.get(index_url + page.get('href'),
                             headers=settings.headers)
        except requests.exceptions.ConnectionError as ce:
            LogE("Failed to connect to '{0}'".format(index_url),
                 "{0}".format(ce))
            # ROBUSTNESS FIX: one unreachable page used to abort the whole
            # fetch with `return`; skip just this page instead.
            continue
        try:
            soup = bs4.BeautifulSoup(r.text, 'html5lib')
            soup.encode('utf-8')
        except Exception:
            LogE("Unable to parse HTML", "{0}".format(sys.exc_info()[0]))
            continue  # same: skip this page, keep fetching the rest

        duration = "n/a"
        try:
            duration = "Vanaf " + soup.select(
                'li.active h2.tab-headline span')[0].get_text()
        except IndexError as e:
            LogE("[IGNORING] Duration not found", "{0}".format(e))
            totalexceptions += 1

        discounts = soup.select('div.product-tile')
        for discount in discounts:
            exceptioncount = 0
            superdata = models.defaultModel.copy()
            superdata['supermarket'] = 'aldi'
            # URL
            try:
                superdata['url'] = index_url + \
                    discount.select('a')[1].get('href')
            except (IndexError, TypeError) as e:
                # NOTE(review): only counted as an error when the model
                # default left 'url' as None -- presumably some tiles have
                # no second anchor and keep a usable default; verify against
                # models.defaultModel.
                if superdata['url'] is None:
                    LogE("[IGNORING] Error", "{0}".format(e))
                    exceptioncount += 1
            # PRODUCTNAME
            try:
                superdata['productname'] = discount.select(
                    'h3')[0].get_text().strip()
            except IndexError as e:
                LogE("[IGNORING] Productname not found", "{0}".format(e))
                exceptioncount += 1
            # DURATION -- shared per page, computed above
            superdata['duration'] = duration
            # IMAGE
            try:
                superdata['image'] = index_url + \
                    discount.select('img')[0].get('src')
            except IndexError as e:
                LogE("[IGNORING] Image not found", "{0}".format(e))
                exceptioncount += 1
            # AMOUNT
            try:
                tempAmount = discount.select(
                    'div.unit')[0].get_text().strip()
                if tempAmount is not None:
                    superdata['amount'] = tempAmount
            except IndexError as e:
                LogE("[IGNORING] Amount not found", "{0}".format(e))
                exceptioncount += 1
            # ACTION PRICE -- strip the footnote asterisk
            try:
                superdata['action_price'] = discount.select(
                    'strong')[0].get_text().replace('*', '')
            except IndexError as e:
                LogE("[IGNORING] Action price not found", "{0}".format(e))
                exceptioncount += 1
            # DESCRIPTION
            try:
                superdata['description'] = discount.select(
                    'div.richtext')[0].get_text().strip()
            except IndexError as e:
                LogE("[IGNORING] Description not found", "{0}".format(e))
                exceptioncount += 1
            # OLD PRICE -- Aldi pages carry no old price; constant
            # assignment cannot raise, so the old try/except was dead code.
            superdata['old_price'] = "n/a"

            count += 1
            totalexceptions += exceptioncount
            if exceptioncount > settings.maxErrors:
                LogE("Too much missing info, skipping this discount",
                     "{0} Errors occured".format(exceptioncount))
                failedcount += 1
            else:
                db.insert(superdata)
                LogD("[{0}] ({1}) Fetched '{2}'".format(
                    exceptioncount, count, superdata['productname']))
            if failedcount > settings.maxFailedDiscounts:
                LogE("Skipping this supermarket, too much missing info.",
                     "More than {0} discounts missing too much info".format(
                         settings.maxFailedDiscounts))
                LogI("Skipping this supermarket, too much missing info")
                LogI("More than {0} discounts missing too much info".format(
                    settings.maxFailedDiscounts))
                return

    seconds = (time.time() * 1000) - start_time
    LogI("Done fetching {0} Aldi discounts in {1}ms. "
         "{2} errors occured and ignored.\n".format(
             count, format(seconds, '.2f'), totalexceptions))
def get_actie_data(actie_page_url):
    """Scrape one Deka action page and insert each discount into the db.

    Updates the module-level counters ``count``, ``failedcount`` and
    ``totalexceptions``. Missing fields are logged and ignored; a discount
    missing more than ``settings.maxErrors`` fields is skipped.
    """
    # Declare globals up front instead of mid-function.
    global count
    global failedcount
    global totalexceptions
    global duration

    try:
        response = requests.get(actie_page_url, headers=settings.headers)
    except requests.exceptions.ConnectionError as ce:
        # BUG FIX: the log referenced `index_url`, which is not defined in
        # this function; report the URL that was actually requested.
        LogE("Failed to connect to '{0}'".format(actie_page_url),
             "{0}".format(ce))
        return
    try:
        soup = bs4.BeautifulSoup(response.text)
        soup.encode('utf-8')
    except Exception:  # narrowed from bare except
        LogE("Unable to parse HTML", "{0}".format(sys.exc_info()[0]))
        return

    category_divs = soup.findAll('div', {'class': 'aanbieding'})
    for div in category_divs:
        exceptioncount = 0
        temp_data = models.defaultModel.copy()
        temp_data['supermarket'] = 'deka'
        temp_data['url'] = actie_page_url
        # PRODUCTNAME
        try:
            temp_data['productname'] = div.find('h2').get_text()
        except Exception:
            LogE("[IGNORING] Productname not found",
                 "{0}".format(sys.exc_info()[0]))
            exceptioncount += 1
        # DURATION -- not available on the page; a constant assignment
        # cannot raise, so the old try/except was dead code.
        temp_data['duration'] = ''
        # DESCRIPTION
        try:
            temp_data['description'] = div.select('div.text')[0].get_text()
        except IndexError as e:
            LogE("[IGNORING] Description not found", "{0}".format(e))
            exceptioncount += 1
        # IMAGE
        try:
            temp_data['image'] = root_url + div.find('img').get('src')
        except Exception:
            LogE("[IGNORING] Image not found",
                 "{0}".format(sys.exc_info()[0]))
            exceptioncount += 1
        # AMOUNT -- decoded from the second CSS class of the "tag" div,
        # e.g. class="tag tagX" -> getAmount("X")
        try:
            temp_data['amount'] = getAmount(
                div.find('div', {'class': re.compile("tag")})
                .get('class')[1].replace('tag', ''))
        except Exception:
            LogE("[IGNORING] Amount not found",
                 "{0}".format(sys.exc_info()[0]))
            exceptioncount += 1
        # ACTION PRICE -- whole part + fractional part
        try:
            temp_data['action_price'] = \
                div.select('span.current span.whole')[0].get_text() + \
                div.select('span.current span.part')[0].get_text()
        except IndexError as e:
            LogE("[IGNORING] Action price not found", "{0}".format(e))
            exceptioncount += 1
        # OLD PRICE
        try:
            temp_data['old_price'] = \
                div.select('span.old span.whole')[0].get_text() + \
                div.select('span.old span.part')[0].get_text()
        except IndexError as e:
            LogE("[IGNORING] Old price not found", "{0}".format(e))
            exceptioncount += 1

        count += 1
        totalexceptions += exceptioncount
        if exceptioncount > settings.maxErrors:
            LogE("Too much missing info, skipping this discount",
                 "{0} Errors occured".format(exceptioncount))
            failedcount += 1
        else:
            db.insert(temp_data)
            LogD("[{0}] ({1}) Fetched '{2}'".format(
                exceptioncount, count, temp_data['productname']))
        if failedcount > settings.maxFailedDiscounts:
            LogE("Skipping this supermarket, too much missing info.",
                 "More than {0} discounts missing too much info".format(
                     settings.maxFailedDiscounts))
            LogI("Skipping this supermarket, too much missing info")
            LogI("More than {0} discounts missing too much info".format(
                settings.maxFailedDiscounts))
            return
def fetch():
    """Fetch all Jan Linders week discounts and insert them into the db.

    Walks every "dots_<n>" category container on the weekly actions page
    and scrapes each hover_discount_product tile. The action price can be
    rendered in several mutually-exclusive markup variants, tried in order.
    """
    LogI("Fetching Jan Linders discounts...")
    start_time = time.time() * 1000
    root_url = 'http://www.janlinders.nl'
    index_url = root_url + '/acties/weekacties/'
    count = 0
    failedcount = 0
    totalexceptions = 0

    try:
        response = requests.get(index_url, headers=settings.headers)
    except requests.exceptions.ConnectionError as ce:
        LogE("Failed to connect to '{0}'".format(index_url),
             "{0}".format(ce))
        return
    try:
        soup = bs4.BeautifulSoup(response.text, 'html5lib')
        soup.encode('utf-8')
    except Exception:  # narrowed from bare except
        LogE("Unable to parse HTML", "{0}".format(sys.exc_info()[0]))
        return

    # FIX: raw string for the regex ('\d' is an invalid escape otherwise).
    category_divs = soup.find_all('div', class_=re.compile(r'dots_\d+'))
    for div in category_divs:
        div_items = div.findAll('div', {'class': 'hover_discount_product'})
        for actdiv in div_items:
            exceptioncount = 0
            temp_data = models.defaultModel.copy()
            temp_data['supermarket'] = 'janlinders'
            temp_data['url'] = index_url

            # PRODUCTNAME -- brand in <b> (optional) + name in <h4>.
            # FIX: the <b> branch was unguarded; an h4-less tile raised an
            # uncaught IndexError. Both branches are now inside the try.
            try:
                name = actdiv.select(
                    'div.action h4')[0].get_text().replace('\n', ' ')
                brand = actdiv.select('div.action b')
                if brand:
                    temp_data['productname'] = \
                        brand[0].get_text() + " " + name
                else:
                    temp_data['productname'] = name
            except IndexError as e:
                LogE("[IGNORING] Productname not found", "{0}".format(e))
                exceptioncount += 1
            # DURATION -- shared across the whole page
            try:
                temp_data['duration'] = \
                    soup.select('div.date-small')[0].get_text()
            except IndexError as e:
                LogE("[IGNORING] Duration not found", "{0}".format(e))
                exceptioncount += 1
            # OLD PRICE
            try:
                tempOldPrice = actdiv.select('.oldprice')[0].get_text()
                # FIX: replaced `is not ''` (identity test on a literal)
                # and redundant None/empty checks with plain truthiness.
                if tempOldPrice:
                    temp_data['old_price'] = tempOldPrice
            except IndexError as e:
                LogE("[IGNORING] Old price not found", "{0}".format(e))
                exceptioncount += 1
            # AMOUNT -- description div minus its h4/b/span/div children
            try:
                tempamount = actdiv.find(
                    'div', {'class': re.compile("^description")})
                for tag in tempamount.findAll('h4'):
                    tag.extract()
                for tag in tempamount.findAll('b'):
                    tag.extract()
                for tag in tempamount.findAll('span'):
                    tag.extract()
                for tag in tempamount.findAll('div'):
                    tag.extract()
                temp_data['amount'] = \
                    tempamount.get_text().replace('\n', ' ')
                try:
                    temp_data['amount'] += ". " + actdiv.select(
                        'div.x_price_w_amount span.small')[0].get_text()
                except Exception:
                    pass  # the extra unit hint is optional
            except Exception:
                LogE("[IGNORING] Amount not found",
                     "{0}".format(sys.exc_info()[0]))
                exceptioncount += 1
            # IMAGE
            try:
                temp_data['image'] = \
                    root_url + actdiv.find('img').get('src')
            except Exception:
                LogE("[IGNORING] Image not found",
                     "{0}".format(sys.exc_info()[0]))
                exceptioncount += 1

            # ACTION PRICE -- try each markup variant in order.
            if actdiv.select('div.action div.regular_price span.big') and \
                    actdiv.select('div.action div.regular_price span.small'):
                try:
                    temp_data['action_price'] = actdiv.select(
                        'div.action div.regular_price span.big'
                    )[0].get_text() + "." + actdiv.select(
                        'div.action div.regular_price span.small'
                    )[0].get_text()
                except Exception:
                    LogE("[IGNORING] Action price not found", "None")
                    exceptioncount += 1
            # FIX: the div.small selector was tested twice in this
            # condition; the duplicate is removed.
            elif actdiv.select('div.x_price_w_amount div.small') and \
                    actdiv.select('div.x_price_w_amount div.big'):
                try:
                    smalls = actdiv.select('div.x_price_w_amount div.small')
                    temp_data['action_price'] = \
                        smalls[0].get_text() + " " + actdiv.select(
                            'div.x_price_w_amount div.big'
                        )[0].get_text() + "." + smalls[1].get_text()
                except Exception:
                    LogE("[IGNORING] Action price not found", "None")
                    exceptioncount += 1
            elif actdiv.select('div.x_free div.big') and \
                    actdiv.select('div.x_free div.small'):
                try:
                    temp_data['action_price'] = ' '.join(
                        actdiv.select('div.x_free div.big')
                        [0].get_text().strip().split()) + " " + ' '.join(
                        actdiv.select('div.x_free div.small')
                        [0].get_text().strip().split())
                except Exception:
                    LogE("[IGNORING] Action price not found", "None")
                    exceptioncount += 1
            elif actdiv.select('div.regular_price div.big') and \
                    actdiv.select('div.regular_price div.small'):
                try:
                    temp_data['action_price'] = actdiv.select(
                        'div.regular_price div.big')[0].get_text() + \
                        "." + actdiv.select(
                        'div.regular_price div.small')[0].get_text()
                except Exception:
                    LogE("[IGNORING] Action price not found", "None")
                    exceptioncount += 1
            elif actdiv.select('div.action div.x_discount div.big') and \
                    actdiv.select('div.action div.x_discount div.small'):
                try:
                    temp_data['action_price'] = ' '.join(
                        actdiv.select('div.action div.x_discount div.big')
                        [0].get_text().strip().split()) + " " + actdiv.select(
                        'div.action div.x_discount div.small')[0].get_text()
                except Exception:
                    LogE("[IGNORING] Action price not found", "None")
                    exceptioncount += 1
            elif actdiv.select('div.x_get_for div.small'):
                try:
                    smalls = actdiv.select('div.x_get_for div.small')
                    temp_data['action_price'] = ' '.join(
                        smalls[0].get_text().strip().split()) + \
                        ", " + smalls[1].get_text().strip()
                except Exception:
                    LogE("[IGNORING] Action price not found", "None")
                    exceptioncount += 1
            elif actdiv.select('div.x_half_price div.small'):
                try:
                    smalls = actdiv.select('div.x_half_price div.small')
                    temp_data['action_price'] = ' '.join(
                        smalls[0].get_text().strip().split()) + \
                        " " + smalls[1].get_text().strip()
                except Exception:
                    LogE("[IGNORING] Action price not found", "None")
                    exceptioncount += 1
            else:
                LogE("[IGNORING] Action price not found", "None")
                exceptioncount += 1

            totalexceptions += exceptioncount
            count += 1
            if exceptioncount > settings.maxErrors:
                LogE("Too much missing info, skipping this discount",
                     "{0} Errors occured".format(exceptioncount))
                failedcount += 1
            else:
                db.insert(temp_data)
                LogD("[{0}] ({1}) Fetched '{2}'".format(
                    exceptioncount, count, temp_data['productname']))
            if failedcount > settings.maxFailedDiscounts:
                LogE("Skipping this supermarket, too much missing info.",
                     "More than {0} discounts missing too much info".format(
                         settings.maxFailedDiscounts))
                LogI("Skipping this supermarket, too much missing info")
                LogI("More than {0} discounts missing too much info".format(
                    settings.maxFailedDiscounts))
                return

    seconds = (time.time() * 1000) - start_time
    LogI("Done fetching {0} Jan Linders discounts in {1}ms. "
         "{2} errors occured and ignored.\n".format(
             count, format(seconds, '.2f'), totalexceptions))
def get_actie_data(actie_page_url):
    """Scrape one Dirk action page and insert the discount into the db.

    Updates the module-level counters ``count``, ``failedcount`` and
    ``totalexceptions``. Individual missing fields are logged and ignored;
    the discount is skipped once more than ``settings.maxErrors`` fields
    are missing.
    """
    global count
    global failedcount
    global totalexceptions

    exceptioncount = 0
    actie_data = models.defaultModel.copy()
    actie_data['supermarket'] = 'dirk'

    url = root_url + actie_page_url
    try:
        response = requests.get(url, headers=settings.headers)
    except requests.exceptions.ConnectionError as ce:
        # BUG FIX: previously logged the undefined name `index_url`,
        # which raised a NameError instead of reporting the failure.
        LogE("Failed to connect to '{0}'".format(url), "{0}".format(ce))
        return
    try:
        soup = bs4.BeautifulSoup(response.text)
        soup.encode('utf-8')
    except Exception:  # narrowed from bare except
        LogE("Unable to parse HTML", "{0}".format(sys.exc_info()[0]))
        return

    actie_data['url'] = root_url + actie_page_url

    # PRODUCTNAME (soup.find returns None when absent -> AttributeError)
    try:
        actie_data['productname'] = soup.find('h2').get_text()
    except Exception:
        LogE("[IGNORING] Productname not found",
             "{0}".format(sys.exc_info()[0]))
        exceptioncount += 1
    # DURATION
    try:
        actie_data['duration'] = soup.select(
            'div.fromTill')[0].get_text().strip()
    except IndexError as e:
        LogE("[IGNORING] Duration not found", "{0}".format(e))
        exceptioncount += 1
    # AMOUNT -- already stripped, so any non-empty string is meaningful
    try:
        amount = soup.select('div.subtitle')[0].get_text().strip()
        if amount:
            actie_data['amount'] = amount
    except IndexError as e:
        LogE("[IGNORING] Amount not found", "{0}".format(e))
        exceptioncount += 1
    # IMAGE -- extracted from the inline CSS background-image: url(...)
    try:
        div_style = soup.find('div', {'class': 'image'})['style']
        style = cssutils.parseStyle(div_style)
        url = style['background-image']
        url = url.replace('url(', '').replace(')', '')
        actie_data['image'] = root_url + url
    except Exception:
        LogE("[IGNORING] Image not found", "{0}".format(sys.exc_info()[0]))
        exceptioncount += 1
    # ACTION PRICE -- title attribute like "\u20ac 1,99" -> "1.99"
    try:
        actie_data['action_price'] = soup.select('div.star')[0].get(
            'title').strip().replace(u"\u20AC ", "").replace(",", ".")
    except IndexError as e:
        LogE("[IGNORING] Action price not found", "{0}".format(e))
        exceptioncount += 1
    # OLD PRICE
    try:
        actie_data['old_price'] = soup.select('span.stripe')[0].get_text()
    except IndexError as e:
        LogE("[IGNORING] Old price not found", "{0}".format(e))
        exceptioncount += 1

    totalexceptions += exceptioncount
    count += 1
    if exceptioncount > settings.maxErrors:
        LogE("Too much missing info, skipping this discount",
             "{0} Errors occured".format(exceptioncount))
        failedcount += 1
    else:
        db.insert(actie_data)
        LogD("[{0}] ({1}) Fetched '{2}'".format(
            exceptioncount, count, actie_data['productname']))
    if failedcount > settings.maxFailedDiscounts:
        LogE("Skipping this supermarket, too much missing info.",
             "More than {0} discounts missing too much info".format(
                 settings.maxFailedDiscounts))
        LogI("Skipping this supermarket, too much missing info")
        LogI("More than {0} discounts missing too much info".format(
            settings.maxFailedDiscounts))
        return
def get_discount_data(actie_page_url):
    """Scrape one Jumbo results page and insert each discount into the db.

    Uses the module-level ``duration`` (set by the caller) for every
    discount on the page and updates the counters ``count``,
    ``failedcount`` and ``totalexceptions``.
    """
    global duration
    global count
    global totalexceptions
    global failedcount

    try:
        response = requests.get(actie_page_url, headers=settings.headers)
    except requests.exceptions.ConnectionError as ce:
        # BUG FIX: log the URL that was actually requested instead of
        # `index_url` (a name this function does not define).
        LogE("Failed to connect to '{0}'".format(actie_page_url),
             "{0}".format(ce))
        return
    try:
        soup = bs4.BeautifulSoup(response.text)
        soup.encode('utf-8')
    except Exception:  # narrowed from bare except
        LogE("Unable to parse HTML", "{0}".format(sys.exc_info()[0]))
        return

    # NOTE(review): `output` is never appended to anywhere below, so the
    # final return value is always []; callers appear to rely only on the
    # db side effects. Kept for interface compatibility.
    output = []
    for discount in soup.findAll('li', {'class': 'jum-result'}):
        exceptioncount = 0
        discount_data = models.defaultModel.copy()
        discount_data['supermarket'] = 'jumbo'
        # presumably index_url is a module-level global here -- verify
        discount_data['url'] = index_url
        # PRODUCTNAME
        try:
            discount_data['productname'] = discount.find('h3').get_text()
        except Exception:
            # BUG FIX: this failure was logged as "Image not found".
            LogE("[IGNORING] Productname not found",
                 "{0}".format(sys.exc_info()[0]))
            exceptioncount += 1
        # DURATION -- shared for the whole page, supplied by the caller
        discount_data['duration'] = duration
        # AMOUNT
        try:
            discount_data['amount'] = discount.select(
                'dd.jum-promotion-text-field')[0].get_text()
        except IndexError as e:
            LogE("[IGNORING] Amount not found", "{0}".format(e))
            exceptioncount += 1
        # IMAGE
        try:
            discount_data['image'] = 'http://www.jumbo.com' + \
                discount.select('dd.jum-item-figure img')[0].get('src')
        except IndexError as e:
            LogE("[IGNORING] Image not found", "{0}".format(e))
            exceptioncount += 1
        # ACTION PRICE -- find() returns None when absent -> AttributeError
        try:
            discount_data['action_price'] = discount.find(
                text=re.compile('Actieprijs')).replace("Actieprijs ", "")
        except AttributeError as e:
            LogE("[IGNORING] Action price not found", "{0}".format(e))
            exceptioncount += 1
        # OLD PRICE
        try:
            discount_data['old_price'] = discount.find(
                text=re.compile('Normale prijs')).replace(
                "Normale prijs ", "")
        except AttributeError as e:
            LogE("[IGNORING] Old price not found", "{0}".format(e))
            exceptioncount += 1

        count += 1
        totalexceptions += exceptioncount
        if exceptioncount > settings.maxErrors:
            LogE("Too much missing info, skipping this discount",
                 "{0} Errors occured".format(exceptioncount))
            failedcount += 1
        else:
            db.insert(discount_data)
            LogD("[{0}] ({1}) Fetched '{2}'".format(
                exceptioncount, count, discount_data['productname']))
        if failedcount > settings.maxFailedDiscounts:
            LogE("Skipping this supermarket, too much missing info.",
                 "More than {0} discounts missing too much info".format(
                     settings.maxFailedDiscounts))
            LogI("Skipping this supermarket, too much missing info")
            LogI("More than {0} discounts missing too much info".format(
                settings.maxFailedDiscounts))
            return
    return output