def parse_recipe(self, response):
    """Scrape one Saveur recipe page into a ``CocktailItem``.

    Returns a one-element list holding the item, or an empty list when the
    page has no ``<h1>`` title or when no ingredients could be located.
    """
    hxs = HtmlXPathSelector(response)

    titles = hxs.select('//h1').extract()
    if not titles:
        return []
    title = titles[0]

    # Only the first photo is used; its URL is resolved against the page URL.
    photos = hxs.select("//img[@itemprop='photo']/@src").extract()
    picture = urljoin(response.url, photos[0]) if photos else None

    # Every node that comes after an "Ingredients" <h4> and before an
    # "Instructions"/"Directions" <h4>.  Each fragment that ends in "or"
    # carries a trailing space: the original literal lacked one after
    # 'instructions', which fused the tokens into "orstarts-with(" and only
    # evaluated on XPath parsers that tolerate the missing whitespace.
    between_headings = (
        "//node()"
        "[preceding::h4["
        "starts-with(text(),'INGREDIENTS') or "
        "starts-with(text(),'Ingredients') or "
        "starts-with(text(),'ingredients')"
        "]]"
        "[following::h4["
        "starts-with(text(),'INSTRUCTIONS') or "
        "starts-with(text(),'Instructions') or "
        "starts-with(text(),'instructions') or "
        "starts-with(text(),'DIRECTIONS') or "
        "starts-with(text(),'Directions') or "
        "starts-with(text(),'directions')"
        "]]"
    )

    # Candidate sources are tried in order; the first non-empty one wins and
    # the remaining ones are never evaluated.
    candidates = (
        split_at_br(hxs.select(between_headings))
        or hxs.select('//div[count(*)=1]/b').extract()
        or split_at_br(hxs.select('//b//node()'))
        or hxs.select("//span[@style='font-weight: bold;']").extract()
    )

    # NOTE(review): the all-uppercase predicate presumably identifies section
    # headings inside the ingredient list -- confirm against
    # extract_extra_ingredients.
    ingredients, extra_ingredients = extract_extra_ingredients(
        candidates,
        lambda s: s.isupper()
    )

    if not ingredients:
        return []

    return [CocktailItem(
        title=html_to_text(title).strip(),
        picture=picture,
        url=response.url,
        source='Saveur',
        ingredients=ingredients,
        extra_ingredients=extra_ingredients
    )]
def parse(self, response):
    """Yield one ``CocktailItem`` per non-blank ``<strong>`` heading.

    The lines following each heading are collected as ingredients until a
    blank line or a line that looks like a preparation step is reached.

    NOTE(review): indexing ``[1]`` below raises IndexError if fewer than two
    lines come back from ``split_at_br`` for a heading.
    """
    selector = HtmlXPathSelector(response)

    for strong in selector.select("//strong[normalize-space(text()) != '']"):
        following = strong.select(
            "ancestor-or-self::node()/following-sibling::node()"
            "[not(self::span[starts-with(text(), 'Stir')])]"
        )
        chunks = split_at_br(following, include_blank=True)

        # Entry 0 is always skipped; entry 1 is skipped as well unless it
        # starts with a digit (i.e. unless it already looks like a quantity).
        start = 1 if chunks[1][:1].isdigit() else 2

        collected = []
        for chunk in chunks[start:]:
            text = html_to_text(chunk).strip()
            # Stop at the first blank line or the first instruction-like line.
            if not text or re.search(
                    r'\b(?:shaken?|stir(?:red)?|fill glass|preparation)\b',
                    text, re.I):
                break
            collected.append(text)

        name = html_to_text(strong.extract()).strip().rstrip('*').title()
        yield CocktailItem(
            title=name,
            picture=None,
            url=response.url,
            source="Dale DeGroff's",
            ingredients=collected
        )
def parse_archive_recipes(self, response, scraped_recipes):
    """Yield a ``CocktailItem`` for each archive recipe not scraped before.

    ``scraped_recipes`` is consulted with the lower-cased recipe title; any
    title found in it is skipped.  Recipes for which no ingredient block is
    found are skipped as well.
    """
    selector = HtmlXPathSelector(response)
    headings = selector.select('//u[b][not(parent::div)] | //div[u[b]]')

    for index, heading in enumerate(headings):
        name = html_to_text(heading.extract()).strip().strip('.').title()
        if name.lower() in scraped_recipes:
            continue

        # Following siblings that have fewer than ``index + 2`` ``<u><b>``
        # elements before them.
        siblings = heading.select(
            './following-sibling::node()[not(preceding::u[b][%d])]'
            % (index + 2)
        )
        chunks = split_at_br(
            siblings,
            include_blank=True,
            newline_elements=['br', 'div', 'b']
        )

        collected = []
        # The trailing '' acts as a final blank line so the last run of
        # lines goes through the same checks as every other run.
        for chunk in chunks + ['']:
            text = html_to_text(chunk).strip()
            if text:
                collected.append(text)
            elif len(collected) > 1:
                # A blank line after two or more lines ends the list.
                break
            else:
                # A run of a single line is discarded and collection restarts.
                collected = []

        if collected:
            yield CocktailItem(
                title=name,
                picture=None,
                url=response.url,
                source="Dr. Adam Elmegirab's",
                ingredients=collected
            )
def parse(self, response):
    """Turn every non-blank ``<strong>`` heading on the page into an item.

    Ingredients are the lines after the heading, read until the first blank
    line or the first line mentioning a preparation keyword.

    NOTE(review): ``rows[1]`` raises IndexError when ``split_at_br`` returns
    fewer than two entries for a heading.
    """
    # Matching with the compiled pattern is equivalent to calling
    # re.search(pattern, line, re.I) on every line.
    stop_pattern = re.compile(
        r'\b(?:shaken?|stir(?:red)?|fill glass|preparation)\b', re.I)

    page = HtmlXPathSelector(response)
    for heading in page.select("//strong[normalize-space(text()) != '']"):
        rows = split_at_br(
            heading.select(
                "ancestor-or-self::node()/following-sibling::node()"
                "[not(self::span[starts-with(text(), 'Stir')])]"
            ),
            include_blank=True
        )

        # Skip the first row, plus the second one when it does not begin
        # with a digit.
        if rows[1][:1].isdigit():
            remaining = rows[1:]
        else:
            remaining = rows[2:]

        found = []
        for row in remaining:
            cleaned = html_to_text(row).strip()
            if not cleaned:
                break
            if stop_pattern.search(cleaned):
                break
            found.append(cleaned)

        label = html_to_text(heading.extract()).strip().rstrip('*').title()
        yield CocktailItem(title=label,
                           picture=None,
                           url=response.url,
                           source="Dale DeGroff's",
                           ingredients=found)
def parse_recipe(self, response):
    """Build a ``CocktailItem`` from a Saveur recipe page.

    Returns ``[item]`` on success and ``[]`` when the page lacks an ``<h1>``
    title or no ingredients are found.
    """

    def h4_starting_with(*words):
        # XPath step matching an <h4> whose text starts with any of *words*.
        # Joining with ' or ' guarantees whitespace around every operator;
        # the hand-written literal this replaces was missing the space after
        # one "or", yielding "orstarts-with(", which strict XPath parsers
        # reject.
        return 'h4[%s]' % ' or '.join(
            "starts-with(text(),'%s')" % word for word in words)

    hxs = HtmlXPathSelector(response)

    # Take the first <h1>; bail out when there is none.
    for title in hxs.select('//h1').extract():
        break
    else:
        return []

    # Take the first photo URL, resolved against the page URL, if any.
    for picture in hxs.select("//img[@itemprop='photo']/@src").extract():
        picture = urljoin(response.url, picture)
        break
    else:
        picture = None

    # Nodes located after the ingredients heading and before the
    # instructions/directions heading.
    section_xpath = '//node()[preceding::%s][following::%s]' % (
        h4_starting_with('INGREDIENTS', 'Ingredients', 'ingredients'),
        h4_starting_with('INSTRUCTIONS', 'Instructions', 'instructions',
                         'DIRECTIONS', 'Directions', 'directions'),
    )

    # Fallback chain: the first source that produces anything is used, and
    # later sources are not evaluated.
    ingredients, extra_ingredients = extract_extra_ingredients(
        (split_at_br(hxs.select(section_xpath))
         or hxs.select('//div[count(*)=1]/b').extract()
         or split_at_br(hxs.select('//b//node()'))
         or hxs.select("//span[@style='font-weight: bold;']").extract()),
        # NOTE(review): presumably flags all-caps lines as sub-headings --
        # confirm in extract_extra_ingredients.
        lambda s: s.isupper())

    if not ingredients:
        return []

    return [
        CocktailItem(title=html_to_text(title).strip(),
                     picture=picture,
                     url=response.url,
                     source='Saveur',
                     ingredients=ingredients,
                     extra_ingredients=extra_ingredients)
    ]
def parse_recipe(self, response, num_recipes, scraped_recipes):
    """Yield the recipe found on this page and, possibly, an archive request.

    The ingredient list is the longest run of consecutive non-blank lines
    inside any ``<p>``; on a tie the later run wins.  After the item is
    yielded its lower-cased title is added to ``scraped_recipes``; when that
    collection then holds exactly ``num_recipes`` entries, a request for the
    'Archives' link is yielded with ``self.parse_archive`` as callback.

    NOTE(review): the title lookup raises AttributeError when the
    ' such as the ' text does not match the regex, and IndexError when the
    ingredient paragraph has no preceding ``<p>`` or the page has no
    'Archives' link.  ``best_paragraph`` is unbound if the page has no
    ``<p>`` at all.
    """
    page = HtmlXPathSelector(response)

    best_run = []
    for paragraph in page.select('//p'):
        run = []
        # The trailing '' closes the final run of every paragraph.
        for raw in split_at_br(paragraph, include_blank=True) + ['']:
            text = html_to_text(raw).strip()
            if text:
                run.append(text)
            else:
                # Blank line: compare the finished run, then start afresh.
                if len(run) >= len(best_run):
                    best_run = run
                    best_paragraph = paragraph
                run = []

    mentions = page.select("//text()[contains(self::text(), ' such as the ')]")
    if mentions:
        sentence = html_to_text(mentions[0].extract())
        name = re.search(
            r'(?<= such as the ).+?(?=,|;| created )', sentence).group(0)
    else:
        # Fall back to the paragraph right before the ingredient paragraph.
        previous = best_paragraph.select('./preceding-sibling::p')[-1]
        name = html_to_text(previous.extract()).rstrip(';')

    yield CocktailItem(
        title=name,
        picture=None,
        url=response.url,
        source="Dr. Adam Elmegirab's",
        ingredients=best_run
    )

    scraped_recipes.add(name.lower())

    if len(scraped_recipes) == num_recipes:
        archive_href = page.select("//a[text() = 'Archives']/@href")[0].extract()
        callback = partial(self.parse_archive, scraped_recipes=scraped_recipes)
        yield Request(urljoin(response.url, archive_href), callback)
def parse_recipe(self, response):
    """Scrape a Monkey 47 Blog post into a single ``CocktailItem``.

    Returns a one-element list with the item, or an empty list when the
    title XPath (``xp_title``) matches nothing.
    """
    hxs = HtmlXPathSelector(response)

    titles = hxs.select(xp_title).extract()
    if not titles:
        return []

    # Entries ending in ':' are dropped.  The check runs on the raw entry,
    # before it is converted to text.
    ingredients = [
        html_to_text(ingredient)
        for ingredient in split_at_br(hxs.select(xp_ingredients))
        if not ingredient.endswith(':')
    ]

    # Keep only what follows the last ':' and then the last en dash.  The
    # dash must be a unicode literal: in a plain Python 2 string '\u2013' is
    # six literal characters rather than U+2013, so the split would never
    # match.  u'...' is also valid on Python 3.3+.
    name = html_to_text(titles[0]).split(':')[-1].split(u'\u2013')[-1].strip()

    return [CocktailItem(
        title=name,
        picture=None,
        url=response.url,
        source='Monkey 47 Blog',
        ingredients=ingredients
    )]
def parse_recipe(self, response):
    """Return ``[CocktailItem]`` for a Monkey 47 Blog post, or ``[]``.

    An empty list is returned when ``xp_title`` matches nothing on the page.
    """
    page = HtmlXPathSelector(response)

    # First extracted title, or None when there is no match.
    raw_title = next(iter(page.select(xp_title).extract()), None)
    if raw_title is None:
        return []

    # Skip entries whose raw form ends with a colon.
    kept = [
        html_to_text(chunk)
        for chunk in split_at_br(page.select(xp_ingredients))
        if not chunk.endswith(':')
    ]

    # Strip everything up to the last colon, then up to the last en dash.
    name = html_to_text(raw_title)
    name = name.split(':')[-1]
    name = name.split(u'\u2013')[-1]
    name = name.strip()

    item = CocktailItem(title=name,
                        picture=None,
                        url=response.url,
                        source='Monkey 47 Blog',
                        ingredients=kept)
    return [item]