def get_deal(self, url):
    """
    Scrape a single Groupon deal page into a dict.

    All deals use the same layout.  Groupon uses different URLs pointing
    to the same deal; the URL format defines the HTML format displayed.

    :param url: absolute URL of the deal page.
    :returns: empty dict when the page could not be fetched/parsed,
        ``{'status': 0}`` for an expired / sold-out deal, otherwise a
        dict with status, merchant, merchant_url (optional), addresses
        (optional), price, rrp and volume.
    :raises ElementMissing: when a mandatory page element is absent.
    """
    deal = {}
    tree = self.parse(url=url)
    if tree._root is None or tree._code != 200:
        return deal
    # The countdown timer is only rendered on live deals.
    if not tree._root.xpath('//input[@id="currentTimeLeft"]'):
        deal['status'] = 0  # Expired / Sold Out
        return deal
    deal['status'] = 1
    info = self.urlinfo(url)
    try:
        tag = tree._root.xpath('//div[@class="merchantContact"]').pop()
    except IndexError:
        raise ElementMissing('{:s}:://div[@class="merchantContact"]'.format(url))
    try:
        # './/h2' keeps the search inside the merchant div; a bare '//h2'
        # would search the whole document from any lxml element.
        deal['merchant'] = utils.get_text(tag.xpath('.//h2[@class="subHeadline"]')[0])
    except IndexError:
        raise ElementMissing('{:s}:merchant://h2[@class="subHeadline"]'.format(url))
    try:
        deal['merchant_url'] = tag.xpath('a').pop().get('href')
    except IndexError:
        pass  # The merchant link is optional.
    address = self.__extract_address_lines(tag, deal, info)
    if address:
        deal['addresses'] = [address]
    try:
        tag = tree._root.xpath('//div[@id="contentDealBuyBox"]/span[@class="price"]/span[@class="noWrap"]').pop()
    except IndexError:
        raise ElementMissing('{:s}:price://div[@id="contentDealBuyBox"]/span[@class="price"]/span[@class="noWrap"]'.format(url))
    price = utils.extract_float_from_tag(tag)
    deal['price'] = price
    try:
        tag = tree._root.xpath('//div[@id="contentDealBuyBox"]/div[contains(@class, "savings")]/*[contains(@class, "_saving")]')[0]
        _savings = utils.extract_float_from_tag(tag)
    except IndexError:
        # No savings element: the deal price equals the recommended price.
        _savings = 0.0
    deal['rrp'] = price + _savings
    try:
        tag = tree._root.xpath('//span[@id="jDealSoldAmount"]').pop()
    except IndexError:
        raise ElementMissing('{:s}:sales://span[@id="jDealSoldAmount"]'.format(url))
    deal['volume'] = int(utils.extract_float_from_tag(tag))
    return deal
def get_deals(self, url):
    """
    Scrape the Groupon Europe RSS feed.

    The feed lists all deals.  Some RSS items bundle more than one
    offering; those are expanded into multiple deal dicts.

    :param url: URL of the RSS feed (must parse as XML).
    :returns: list of deal dicts — empty on parse/network failure, on a
        non-XML response, or on a non-200 status code.
    """
    def _build_deal(title, headline, link, pubDate):
        # Assemble one deal dict plus its deduplication hash.
        # 'link' must already be query-string stripped.
        deal = {'title': title, 'headline': headline, 'link': link}
        info = self.urlinfo(link)
        deal['rel_id'] = info['rel_id']
        deal['pubDate'] = pubDate
        deal['site'] = info['site']
        deal['locale'] = info['locale']
        deal['location'] = info['location']
        deal['category'] = info['category']
        hashbag = [
            deal['title'], deal['headline'], deal['pubDate'],
            deal['rel_id'], deal['site'], deal['locale'],
            deal['location'], deal['category'],
        ]
        deal['hashid'] = self.get_hash(hashbag)
        return deal

    deals = []
    tree = self.parse(url=url)
    if tree._root is None:
        return []  # The parsing failed (usually caused by connection/network issues)
    if tree._ptype != 'XML':
        return []  # The Groupon parser requires an XML-formatted URL
    if tree._code != 200:
        return deals
    for item in tree._root.xpath('/rss/channel/item'):
        title = utils.get_text(item.xpath('title')[0])
        pubDate = utils.get_text(item.xpath('pubDate')[0])[:-4]  # Remove ' GMT'
        pubDate = datetime.datetime.strptime(pubDate, '%a, %d %b %Y %X')
        link = utils.strip_qs(utils.get_text(item.xpath('link')[0]))
        description = etree.HTML(item.xpath('description')[0].text)
        if len(description.xpath('//ul/a')) > 1:
            # Multi-offer item: each <ul> in the description is one deal.
            for itm in description.xpath('//ul'):
                deals.append(_build_deal(
                    utils.get_text(itm.xpath('a')[0]),
                    utils.get_text(itm.xpath('br')[0], with_tail=True),
                    utils.strip_qs(itm.xpath('a')[0].get('href')),
                    pubDate,
                ))
        else:
            # Single-offer item: use the RSS item's own title and link.
            deals.append(_build_deal(title, utils.get_text(description), link, pubDate))
    return deals