def parse_archive_recipes(self, response, scraped_recipes):
    hxs = HtmlXPathSelector(response)
    for i, title_node in enumerate(hxs.select('//u[b][not(parent::div)] | //div[u[b]]')):
        title = html_to_text(title_node.extract()).strip().strip('.').title()
        if title.lower() in scraped_recipes:
            continue
        ingredients = []
        for line in split_at_br(title_node.select('./following-sibling::node()[not(preceding::u[b][%d])]' % (i + 2)),
                                include_blank=True,
                                newline_elements=['br', 'div', 'b']) + ['']:
            line = html_to_text(line).strip()
            if not line:
                # Blank line: discard a lone stray line, otherwise treat it as
                # the end of the ingredient block.
                if len(ingredients) == 1:
                    ingredients = []
                if ingredients:
                    break
                continue
            ingredients.append(line)
        if not ingredients:
            continue
        yield CocktailItem(
            title=title,
            picture=None,
            url=response.url,
            source="Dr. Adam Elmegirab's",
            ingredients=ingredients
        )
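# The spiders in this section rely on two project helpers that are not shown
# here: html_to_text (assumed to strip tags and entities, returning plain
# text) and split_at_br.  The function below is only a sketch of what
# split_at_br presumably does, inferred from how it is called; the real
# helper also accepts selector objects and may differ in detail.
import re

def split_at_br_sketch(html, include_blank=False, newline_elements=('br',)):
    # Split an HTML fragment into "lines" wherever one of newline_elements
    # opens or closes; optionally keep the blank lines.
    pattern = r'</?(?:%s)\b[^>]*>' % '|'.join(newline_elements)
    lines = [part.strip() for part in re.split(pattern, html)]
    return lines if include_blank else [part for part in lines if part]

# split_at_br_sketch('2 oz gin<br>1 oz vermouth<br><br>Stir with ice.')
# -> ['2 oz gin', '1 oz vermouth', 'Stir with ice.']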
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    for title in hxs.select("//strong[normalize-space(text()) != '']"):
        lines = split_at_br(
            title.select("ancestor-or-self::node()/following-sibling::node()[not(self::span[starts-with(text(), 'Stir')])]"),
            include_blank=True)
        ingredients = []
        # Skip the title line, plus one more line if the line after it does
        # not start with a digit (i.e. is not a measured ingredient).
        for line in lines[1 + (not lines[1][:1].isdigit()):]:
            line = html_to_text(line).strip()
            if not line:
                break
            if re.search(r'\b(?:shaken?|stir(?:red)?|fill glass|preparation)\b', line, re.I):
                break
            ingredients.append(line)
        yield CocktailItem(
            title=html_to_text(title.extract()).strip().rstrip('*').title(),
            picture=None,
            url=response.url,
            source="Dale DeGroff's",
            ingredients=ingredients
        )
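# For reference, the slicing expression above skips the title line and, when
# the line right after it does not start with a digit, skips that line too.
# Toy illustration with made-up lines:
#
#   lines = ['Martinez', 'A precursor to the Martini', '2 oz Old Tom gin', '1 oz sweet vermouth']
#   lines[1 + (not lines[1][:1].isdigit()):]
#   -> ['2 oz Old Tom gin', '1 oz sweet vermouth']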
def parse_recipes(self, response):
    hxs = HtmlXPathSelector(response)
    # If this page is not the canonical one, request the canonical URL instead
    # and stop parsing here.
    for url in hxs.select("//link[@rel='canonical']/@href").extract():
        url = urljoin(response.url, url)
        if url != response.url:
            yield Request(url, callback=self.parse_recipes)
            raise StopIteration
    for recipe in hxs.select(xp_recipes):
        for title in recipe.select('caption').extract():
            break
        else:
            continue
        ingredients = recipe.select(xp_ingredients).extract()
        if not ingredients:
            continue
        for picture in recipe.select("tr/td[@colspan='2']//img/@src | preceding-sibling::*[contains(concat(' ', normalize-space(@class), ' '), ' thumb ')]//img/@src").extract():
            picture = urljoin(response.url, picture)
            break
        else:
            picture = None
        yield CocktailItem(
            title=html_to_text(title),
            picture=picture,
            url=response.url,
            source='Wikipedia',
            ingredients=[html_to_text(x) for x in ingredients]
        )
def parse_recipe(self, response):
    hxs = HtmlXPathSelector(response)
    ingredients = [
        html_to_text(s).split('-', 1)[1].strip()
        for s in hxs.select(xp_ingredients).extract()
    ]
    if not ingredients:
        return []
    for title in hxs.select(xp_header).extract():
        break
    else:
        return []
    for picture in hxs.select(xp_picture).extract():
        picture = urljoin(response.url, picture)
        # Keep only the first picture; without this break the else branch
        # below would always reset it to None.
        break
    else:
        picture = None
    return [
        CocktailItem(
            title=html_to_text(title),
            picture=picture,
            url=response.url,
            source='Cocktail Times',
            ingredients=ingredients,
        )
    ]
def parse_recipe(self, response, num_recipes, scraped_recipes):
    hxs = HtmlXPathSelector(response)
    ingredients = []
    for paragraph in hxs.select('//p'):
        l = []
        for line in split_at_br(paragraph, include_blank=True) + ['']:
            line = html_to_text(line).strip()
            if line:
                l.append(line)
                continue
            # Blank line: keep the longest run of lines seen so far as the
            # ingredient list, then start a new run.
            if len(l) >= len(ingredients):
                ingredients = l
                paragraph_with_ingredients = paragraph
            l = []
    title = hxs.select("//text()[contains(self::text(), ' such as the ')]")
    if title:
        title = html_to_text(title[0].extract())
        title = re.search(r'(?<= such as the ).+?(?=,|;| created )', title).group(0)
    else:
        title = paragraph_with_ingredients.select('./preceding-sibling::p')[-1]
        title = html_to_text(title.extract()).rstrip(';')
    yield CocktailItem(
        title=title,
        picture=None,
        url=response.url,
        source="Dr. Adam Elmegirab's",
        ingredients=ingredients
    )
    scraped_recipes.add(title.lower())
    # Once every recipe post has been scraped, move on to the archives page.
    if len(scraped_recipes) == num_recipes:
        yield Request(
            urljoin(
                response.url,
                hxs.select("//a[text() = 'Archives']/@href")[0].extract()
            ),
            partial(self.parse_archive, scraped_recipes=scraped_recipes)
        )
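# The lookbehind/lookahead pattern above pulls the drink name out of prose of
# the form "... such as the <Name>, created ...".  Made-up sentence for
# illustration:
#
#   re.search(r'(?<= such as the ).+?(?=,|;| created )',
#             'bitters used in drinks such as the Japanese Cocktail, created by Jerry Thomas')
#   -> matches 'Japanese Cocktail'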
def extract_extra_ingredients(nodes, is_section_header):
    # Group non-blank lines under the most recent section header.  Lines seen
    # before any header (or, failing that, the last section) become the main
    # ingredient list; everything else is returned as extra ingredients.
    section = None
    sections = OrderedDict()
    for node in nodes:
        text = node.extract() if isinstance(node, XPathSelector) else node
        text = html_to_text(text).strip()
        if not text:
            continue
        if is_section_header(node):
            section = text
            continue
        sections.setdefault(section, []).append(text)
    if None in sections:
        ingredients = sections.pop(None)
    elif sections:
        ingredients = sections.pop(sections.keys()[-1])
    else:
        ingredients = []
    extra_ingredients = [x for y in sections.values() for x in y]
    return (ingredients, extra_ingredients)
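# Illustration of extract_extra_ingredients with plain strings as nodes and an
# all-caps test for section headers (assumes html_to_text returns plain-text
# input unchanged):
#
#   nodes = ['2 oz gin', '1 oz dry vermouth', 'GARNISH', 'lemon twist']
#   extract_extra_ingredients(nodes, lambda n: n.isupper())
#   -> (['2 oz gin', '1 oz dry vermouth'], ['lemon twist'])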
def parse_recipe(self, response, title, picture):
    hxs = HtmlXPathSelector(response)
    section = None
    sections = OrderedDict()
    for node in hxs.select(xp_ingredients):
        text = html_to_text(node.extract()).strip()
        if not text:
            continue
        if node.select('strong'):
            section = text
            continue
        sections.setdefault(section, []).append(text)
    ingredients = sections.pop(None, None) or sections.pop(sections.keys()[-1])
    extra_ingredients = [x for y in sections.values() for x in y]
    yield CocktailItem(
        title=title,
        picture=picture,
        url=response.url,
        ingredients=ingredients,
        extra_ingredients=extra_ingredients
    )
def parse_recipe(self, response):
    hxs = HtmlXPathSelector(response)
    for title in hxs.select('//h1').extract():
        break
    else:
        return []
    for picture in hxs.select("//img[@itemprop='photo']/@src").extract():
        picture = urljoin(response.url, picture)
        break
    else:
        picture = None
    ingredients = hxs.select("//*[@itemprop='ingredient']").extract()
    return [
        CocktailItem(
            title=html_to_text(title),
            picture=picture,
            url=response.url,
            source='Liquor.com',
            ingredients=map(html_to_text, ingredients),
        )
    ]
def parse_recipe(self, response):
    hxs = HtmlXPathSelector(response)
    for title in hxs.select('//h2').extract():
        break
    else:
        return []
    ingredients = hxs.select(xp_ingredients).extract()
    return [CocktailItem(
        title=html_to_text(title),
        picture=None,
        url=response.url,
        source='CocktailDB',
        ingredients=[html_to_text(x) for x in ingredients],
    )]
def parse_recipe(self, response):
    hxs = HtmlXPathSelector(response)
    for title in hxs.select(xp_title).extract():
        break
    else:
        return []
    ingredients = hxs.select(xp_ingredients).extract()
    return [CocktailItem(
        title=re.sub(r'\s+recipe$', '', html_to_text(title)),
        picture=None,
        url=response.url,
        source='Drinks Mixer',
        ingredients=[html_to_text(x) for x in ingredients],
    )]
def parse_recipe(self, response):
    hxs = HtmlXPathSelector(response)
    for title in hxs.select(xp_title).extract():
        break
    else:
        return []
    ingredients = []
    for ingredient in split_at_br(hxs.select(xp_ingredients)):
        # Skip lines that are only a section header ending in ':'.
        if not ingredient.endswith(':'):
            ingredients.append(html_to_text(ingredient))
    return [CocktailItem(
        # Keep only the part of the post title after any ':' or en dash.
        title=html_to_text(title).split(':')[-1].split(u'\u2013')[-1].strip(),
        picture=None,
        url=response.url,
        source='Monkey 47 Blog',
        ingredients=ingredients
    )]
def parse_recipes(self, response, recipe_urls):
    hxs = HtmlXPathSelector(response)
    for url in recipe_urls:
        node = hxs.select("//*[@id='%s']" % urlparse(url).fragment)[0]
        for picture in node.select('./preceding-sibling::*[1]/img/@src').extract():
            picture = urljoin(url, picture)
            break
        else:
            picture = None
        ingredients = node.select('./following-sibling::*[position()<=2]/li').extract()
        yield CocktailItem(
            title=html_to_text(node.extract()),
            picture=picture,
            url=url,
            source='Oh Gosh!',
            ingredients=[html_to_text(x) for x in ingredients],
        )
def parse_recipe(self, response):
    hxs = HtmlXPathSelector(response)
    for title in hxs.select('//h1').extract():
        break
    else:
        return []
    for picture in hxs.select("//img[@itemprop='photo']/@src").extract():
        picture = urljoin(response.url, picture)
        break
    else:
        picture = None
    # Try the marked-up ingredient block first, then fall back to
    # progressively looser selections of bold text.
    ingredients, extra_ingredients = extract_extra_ingredients(
        (
            split_at_br(hxs.select(
                "//node()"
                "[preceding::h4["
                "starts-with(text(),'INGREDIENTS') or "
                "starts-with(text(),'Ingredients') or "
                "starts-with(text(),'ingredients')"
                "]]"
                "[following::h4["
                "starts-with(text(),'INSTRUCTIONS') or "
                "starts-with(text(),'Instructions') or "
                "starts-with(text(),'instructions') or "
                "starts-with(text(),'DIRECTIONS') or "
                "starts-with(text(),'Directions') or "
                "starts-with(text(),'directions')"
                "]]"
            )) or
            hxs.select('//div[count(*)=1]/b').extract() or
            split_at_br(hxs.select('//b//node()')) or
            hxs.select("//span[@style='font-weight: bold;']").extract()
        ),
        lambda s: s.isupper()
    )
    if not ingredients:
        return []
    return [CocktailItem(
        title=html_to_text(title).strip(),
        picture=picture,
        url=response.url,
        source='Saveur',
        ingredients=ingredients,
        extra_ingredients=extra_ingredients
    )]
def parse_recipe(self, response):
    hxs = HtmlXPathSelector(response)
    for title in hxs.select("//*[@itemprop='name']").extract():
        break
    else:
        return []
    for picture in hxs.select("//img[@itemprop='image']/@src").extract():
        picture = urljoin(response.url, picture)
        break
    else:
        picture = None
    ingredients = hxs.select(xp_ingredients).extract()
    return [CocktailItem(
        title=html_to_text(title),
        picture=picture,
        url=response.url,
        source='DrinkBoy',
        ingredients=[html_to_text(x) for x in ingredients],
    )]
def parse_recipe(self, response):
    hxs = HtmlXPathSelector(response)
    for title in hxs.select("//meta[@property='og:title']/@content").extract():
        break
    else:
        return []
    for picture in hxs.select("//*[@id='drink_infopicvid']/img/@src").extract():
        picture = urljoin(response.url, picture)
        break
    else:
        picture = None
    ingredients = []
    for node in hxs.select("//ul[@id='ingredients']/li"):
        parts = []
        for child in node.select('* | text()'):
            text = html_to_text(child.extract())
            if 'ingredient' in (child.xmlNode.prop('class') or '').split():
                text = text.split('--')[-1]
            text = text.strip()
            if not text:
                continue
            parts.append(text)
        ingredients.append(' '.join(parts))
    # don't crawl recipes like 'American Whiskey & Canadian Whisky',
    # that only consist of pouring a single spirit into a glass.
    if len(ingredients) <= 1:
        return []
    return [CocktailItem(
        title=unescape(title),
        picture=picture,
        url=response.url,
        source='Esquire',
        ingredients=ingredients
    )]