def parse_item(self, response):
    """Build recipe items from one page response.

    When ``parse_recipes`` returns anything, each parsed recipe gets the
    page image attached and a list of ``RecipeItem`` objects is returned.
    Otherwise the page is scraped by hand and a single loaded item is
    returned.
    """
    selector = HtmlXPathSelector(response)
    # note: this holds the extracted values, not an XPath string
    images = selector.select(
        "descendant-or-self::img[@class and contains(@class, 'wp-image')][1]/@data-lazy-src"
    ).extract()

    parsed = parse_recipes(
        selector, {'source': self.source, 'url': response.url})
    if parsed:
        # structured (schema.org) data was found; only the image is added
        for entry in parsed:
            entry['image'] = images
        return [RecipeItem.from_dict(entry) for entry in parsed]

    # no structured data: fall back to hand-written selectors
    loader = RecipeItemLoader(item=RecipeItem())
    loader.add_value('source', self.source)
    loader.add_value('url', response.url)
    loader.add_value('image', images)
    loader.add_value(
        'name',
        selector.select('//*[@class="post-title"]/h1/text()').extract())

    # ingredients may live in paragraphs or in list items; both kinds of
    # node are vetted by is_ingredient_container before their text is used
    container_paths = (
        '//div[@id="recipe" or @class="span9"]/p',
        '//*[@class="span9"]//ul/li',
    )
    for container_path in container_paths:
        for node in selector.select(container_path):
            if is_ingredient_container(node):
                loader.add_value(
                    'ingredients', node.select('text()').extract())

    # ...or in explicitly tagged list items, which are taken as-is
    for text_node in selector.select('//li[@class="ingredient"]/text()'):
        loader.add_value('ingredients', text_node.extract())

    return loader.load_item()
def parse_item(self, response):
    """Return one loaded recipe item per ``post hrecipe`` element."""
    selector = HtmlXPathSelector(response)

    title_xpath = '//*[@class="title fn"]/text()'
    photo_xpath = '//*[@class="photo"]/@src'
    ingredient_xpath = '//ul[@class="ingredient_list"]/li/text()'

    items = []
    for scope in selector.select('//*[@class="post hrecipe"]'):
        loader = RecipeItemLoader(item=RecipeItem())
        loader.add_value('source', self.source)
        loader.add_value('name', scope.select(title_xpath).extract())
        loader.add_value('image', scope.select(photo_xpath).extract())
        loader.add_value('url', response.url)
        loader.add_value(
            'ingredients', scope.select(ingredient_xpath).extract())
        items.append(loader.load_item())
    return items
def parse_item(self, response):
    """Extract schema.org Recipe items from the response.

    Returns a list holding one loaded ``RecipeItem`` for every ``div``
    marked up with ``itemtype="http://schema.org/Recipe"``; the list is
    empty when the page has no such element.
    """
    # we use this to run XPath commands against the HTML in the response
    hxs = HtmlXPathSelector(response)

    # base XPath for the element that contains the recipe info
    base_path = """//div[@itemtype="http://schema.org/Recipe"]"""

    # select() returns a list of selector objects, one per match
    recipes_scopes = hxs.select(base_path)

    # it's easier to define these XPath strings outside of the loop below
    name_path = '//div[@itemprop="name"]/text() | //*[@itemprop="name"]//*[@class="fn"]/text()'
    description_path = '//div[@itemprop="description"]/text()'
    image_path = '//img[1]/@src'
    prepTime_path = '//time[@itemprop="prepTime"][contains(@datetime, "PT")]/@datetime | //time[@itemprop="prepTime"]//*[@class="value-title"]/@title'
    cookTime_path = '//time[@itemprop="cookTime"][contains(@datetime, "PT")]/@datetime | //time[@itemprop="cookTime"]//*[@class="value-title"]/@title'
    recipeYield_path = '//span[@itemprop="recipeYield"]/text()'
    ingredients_path = '//li[@itemprop="ingredients"]/text()'
    datePublished = '//abbr[@class="published"]/text()'

    recipes = []

    # loop through our recipe scopes and extract the recipe data from each
    for r_scope in recipes_scopes:
        il = RecipeItemLoader(item=RecipeItem())

        il.add_value('source', self.source)
        il.add_value('name', r_scope.select(name_path).extract())

        # There's a bunch of images on each page and this spider has always
        # used the match at index 1 (i.e. the second one), so that choice is
        # kept. Indexing blindly raised IndexError, and lost the whole
        # page, whenever fewer than two images matched; the image is now
        # left out in that case instead.
        # NOTE(review): the old comment said "grab the first" -- confirm
        # that index 1 rather than 0 is really the intended image.
        images = r_scope.select(image_path).extract()
        if len(images) > 1:
            il.add_value('image', images[1])

        il.add_value('url', response.url)
        il.add_value('description',
                     r_scope.select(description_path).extract())
        il.add_value('prepTime', r_scope.select(prepTime_path).extract())
        il.add_value('cookTime', r_scope.select(cookTime_path).extract())
        il.add_value('recipeYield',
                     r_scope.select(recipeYield_path).extract())
        il.add_value('ingredients',
                     r_scope.select(ingredients_path).extract())
        il.add_value('datePublished',
                     r_scope.select(datePublished).extract())

        # stick this RecipeItem in the array of recipes we will return
        recipes.append(il.load_item())

    # more processing is done by the openrecipes.pipelines. Look at that
    # file to see transforms that are applied to each RecipeItem
    return recipes
def parse_item(self, response):
    """Return one loaded recipe item per ``article.hrecipe`` element.

    Each item carries source, name, image, url, prep/cook time, yield,
    ingredients and publication date, read with the XPaths defined below.
    """
    selector = HtmlXPathSelector(response)

    # XPath strings are kept together here so the loop stays readable
    title_xpath = '//h1/text()'
    yield_xpath = '//span[@class="info yield"]/text()'
    photo_xpath = '//section[@class="content-unit"]/img/@src'
    prep_xpath = '//span[@class="info preptime"]/text()'
    cook_xpath = '//span[@class="info duration"]/text()'
    ingredient_xpath = '//div[@class="ingredients-section"]/ul/li/span/text()'
    published_xpath = '//footer/time/text()'

    items = []
    for scope in selector.select("""//article[@class="hrecipe"]"""):
        loader = RecipeItemLoader(item=RecipeItem())

        loader.add_value('source', self.source)
        loader.add_value('name', scope.select(title_xpath).extract())
        loader.add_value('image', scope.select(photo_xpath).extract())
        loader.add_value('url', response.url)
        loader.add_value('prepTime', scope.select(prep_xpath).extract())
        loader.add_value('cookTime', scope.select(cook_xpath).extract())
        loader.add_value('recipeYield', scope.select(yield_xpath).extract())

        # one string per matching text node
        loader.add_value(
            'ingredients',
            [node.extract() for node in scope.select(ingredient_xpath)])

        loader.add_value(
            'datePublished', scope.select(published_xpath).extract())

        items.append(loader.load_item())

    # further transforms are applied to each item by openrecipes.pipelines
    return items
def parse_item(self, response):
    """Extract one recipe item per schema.org Recipe element on the page.

    Returns a list of loaded ``RecipeItem`` objects; the list is empty when
    no element carries ``itemtype="http://schema.org/Recipe"``.
    """
    hxs = HtmlXPathSelector(response)

    base_path = '//*[@itemtype="http://schema.org/Recipe"]'
    recipes_scopes = hxs.select(base_path)

    name_path = '//*[@class="recipe-title"]/text()'
    # not sure how to get the description consistently on this one, so it
    # is not collected
    image_path = '//*[@itemprop="image"]/@src'
    # number() is an XPath function, not a node test, so ".../number()" is
    # not a valid XPath 1.0 location step and made select() fail; read the
    # text node instead, the same way cookTime does
    prepTime_path = '//*[@class="prep-time tooltip-element"]/text()'
    cookTime_path = '//*[@class="total-time tooltip-element"]/text()'
    recipeYield_path = '//*[@itemprop="recipeYield"]/text()'
    # may have to make ingredients more generic
    ingredients_path = '//*[@class="ingredients-list"]/ul'
    datePublished = '//*[@class="date published time"]/text()'

    # Sub-queries are relative (".//") so each one only sees nodes inside
    # the ingredient scope it is run on, not the whole document.
    amount_path = './/*[@class="ingredient-n"]/text()'
    unit_path = './/*[@class="ingredient-unit"]/text()'
    ingredient_name_path = './/*[@class="ingredient-name"]/text()'

    recipes = []
    for r_scope in recipes_scopes:
        il = RecipeItemLoader(item=RecipeItem())

        il.add_value('source', self.source)
        il.add_value('name', r_scope.select(name_path).extract())
        il.add_value('image', r_scope.select(image_path).extract())
        il.add_value('url', response.url)
        il.add_value('prepTime', r_scope.select(prepTime_path).extract())
        il.add_value('cookTime', r_scope.select(cookTime_path).extract())
        il.add_value('recipeYield',
                     r_scope.select(recipeYield_path).extract())

        # NOTE(review): each scope here is a whole <ul>, so all amounts,
        # units and names inside one list end up in a single string.
        # Iterating the list's items is probably what is wanted -- confirm
        # against the site's markup.
        ingredients = []
        for i_scope in r_scope.select(ingredients_path):
            parts = [
                "".join(i_scope.select(path).extract()).strip()
                for path in (amount_path, unit_path, ingredient_name_path)
            ]
            # the old two-placeholder format string was fed three values
            # and raised TypeError; join whichever parts are present
            ingredients.append(" ".join(part for part in parts if part))
        il.add_value('ingredients', ingredients)

        il.add_value('datePublished',
                     r_scope.select(datePublished).extract())

        recipes.append(il.load_item())

    return recipes
def parse_item(self, response):
    """Return one loaded recipe item per matching content block.

    The publication date text has its first ``on`` and any ``|`` removed
    before it is stored.
    """
    selector = HtmlXPathSelector(response)

    # element that wraps the recipe information
    content_xpath = """//*[@id="container"]/*[@class="onepage"]/div[1]/div[@class="content"]"""

    title_xpath = '//h1[@class="title"]/text() | //*[@class="content"]/p[@style="text-align: center;"]/following-sibling::p[strong]/strong/text()'
    photo_xpath = '//*[@class="content"]/p[1]/img[contains(@class, "size-full")]/@src'
    yield_xpath = '//*[@class="content"]/p[@style="text-align: center;"]/following-sibling::p[em and strong]/em/text()'
    published_xpath = '//*[@class="phn-date"]/a[@rel="author"]/following-sibling::text()'

    # Ingredients and garnishes are both <br>-separated "lists" inside a
    # <p>. The <p> holding "EVENT VENUE PARTY SIZE TYPE MENU" is skipped by
    # only taking <p>s without <strong>, <a> or <img> children.
    ingredient_xpath = '//*[@class="content"]/p[not(strong or a or img) and br]/text()'

    items = []
    for scope in selector.select(content_xpath):
        loader = RecipeItemLoader(item=RecipeItem())

        loader.add_value('source', self.source)
        loader.add_value('name', scope.select(title_xpath).extract())
        loader.add_value('image', scope.select(photo_xpath).extract())
        loader.add_value('url', response.url)
        loader.add_value('recipeYield', scope.select(yield_xpath).extract())

        # raw text looks like "ON SATURDAY NOV 28TH, 2009 |"
        published = scope.select(published_xpath).extract()
        if published:
            published = published[0].replace(
                'on', '', 1).replace('|', '').strip()
        loader.add_value('datePublished', published)

        loader.add_value(
            'ingredients', scope.select(ingredient_xpath).extract())

        items.append(loader.load_item())

    # further transforms are applied to each item by openrecipes.pipelines
    return items
def parse_item(self, response):
    """Return one loaded recipe item per ``recipe hrecipe`` element.

    Ingredient lines that start with ``For `` or end with ``:`` are
    treated as section labels and dropped.
    """
    selector = HtmlXPathSelector(response)

    title_xpath = './/*[@class="fn"]/text()'
    # the description is laid out oddly on this site, so it is not collected
    photo_xpath = '//div/p[1]//img/@src'
    prep_xpath = '//*[@class="preptime"]/text()'
    cook_xpath = '//*[@class="cooktime"]/text()'
    yield_xpath = '//*[@class="yield"]/text()'
    ingredient_xpath = './/div[@class="ingredient"]/p/text()'
    # this picks up the whole postmeta text, not just the date
    published_xpath = '//*[@class="postmeta"]/text()'

    heading_re = re.compile(r'^For ')

    items = []
    for scope in selector.select('//*[@class="recipe hrecipe"]'):
        loader = RecipeItemLoader(item=RecipeItem())

        loader.add_value('source', self.source)
        loader.add_value('name', scope.select(title_xpath).extract())
        loader.add_value('image', scope.select(photo_xpath).extract())
        loader.add_value('url', response.url)
        loader.add_value('prepTime', scope.select(prep_xpath).extract())
        loader.add_value('cookTime', scope.select(cook_xpath).extract())
        loader.add_value('recipeYield', scope.select(yield_xpath).extract())

        lines = [
            node.extract().strip()
            for node in scope.select(ingredient_xpath)
        ]
        loader.add_value('ingredients', [
            line for line in lines
            if not (heading_re.match(line) or line.endswith(':'))
        ])

        loader.add_value(
            'datePublished', scope.select(published_xpath).extract())

        items.append(loader.load_item())

    return items
def parse_item(self, response):
    """Extract recipe items from a blog post page.

    URLs containing ``/ingredients/`` or ``/category/`` are skipped and
    yield an empty list. Otherwise one loaded ``RecipeItem`` is returned
    per ``div`` with class ``blog``.
    """
    if '/ingredients/' in response.url or '/category/' in response.url:
        return []

    hxs = HtmlXPathSelector(response)

    base_path = '//div[@class="blog"]'
    recipes_scopes = hxs.select(base_path)

    name_path = 'h1/a[@rel="bookmark"]/text()'
    description_path = '//meta[@property="og:description"]/@content'
    image_path = '//meta[@property="og:image"][1]/@content'
    recipeYield_path = './/*[@class="yield"]//text()[normalize-space()]'
    ingredients_path = './/*[@class="ingredient"]'
    datePublished_path = '//div[@class="blurb"]/strong/text()[1]'

    recipes = []
    for r_scope in recipes_scopes:
        il = RecipeItemLoader(item=RecipeItem())

        il.add_value('source', self.source)
        il.add_value('name', r_scope.select(name_path).extract())
        il.add_value('image', r_scope.select(image_path).extract())
        il.add_value('url', response.url)
        il.add_value('description',
                     r_scope.select(description_path).extract())
        il.add_value('recipeYield',
                     ' '.join(r_scope.select(recipeYield_path).extract()))

        # each ingredient node may hold several text fragments; join the
        # non-blank ones with single spaces
        ingredients = []
        for ingredient_node in r_scope.select(ingredients_path):
            fragments = ingredient_node.select(
                './/text()[normalize-space()]').extract()
            ingredients.append(
                ' '.join(fragment.strip() for fragment in fragments))
        il.add_value('ingredients', ingredients)

        # The XPath and the extracted text use separate names: the old code
        # reused one variable, so a second scope was queried with the first
        # scope's date text instead of the XPath. A missing date no longer
        # raises IndexError; the field is just left unset.
        published = r_scope.select(datePublished_path).extract()
        if published:
            il.add_value(
                'datePublished',
                published[0].replace('Posted on', '')
                            .replace('in', '')
                            .strip())

        recipes.append(il.load_item())

    return recipes
def parse_item(self, response):
    """Return one loaded recipe item per ``hrecipe`` element.

    Prep time, cook time and yield are not collected here. Ingredient
    lines that start with ``For `` or end with ``:`` are dropped.
    """
    selector = HtmlXPathSelector(response)

    title_xpath = '//h1[@class="fn"]/text()'
    # the site uses long descriptions, and the description doesn't appear
    description_xpath = '//*[@class="format_text entry-content jpibfi_container"]/p/text()'
    # the image URL ends with 150x150 dimensions; not sure how to remove them
    photo_xpath = '//*[@class="photo"]/@src'
    ingredient_xpath = './/*[@class="ingredient"]/p/text()'
    published_xpath = '//span[@class="published"]/text()'

    heading_re = re.compile(r'^For ')

    items = []
    for scope in selector.select('//*[@class="hrecipe"]'):
        loader = RecipeItemLoader(item=RecipeItem())

        loader.add_value('source', self.source)
        loader.add_value('name', scope.select(title_xpath).extract())
        loader.add_value('image', scope.select(photo_xpath).extract())
        loader.add_value('url', response.url)
        loader.add_value(
            'description', scope.select(description_xpath).extract())

        lines = [
            node.extract().strip()
            for node in scope.select(ingredient_xpath)
        ]
        loader.add_value('ingredients', [
            line for line in lines
            if not (heading_re.match(line) or line.endswith(':'))
        ])

        loader.add_value(
            'datePublished', scope.select(published_xpath).extract())

        items.append(loader.load_item())

    return items
def parse_item(self, response):
    """Extract hrecipe items from the response.

    Returns a list with one loaded ``RecipeItem`` per ``span`` of class
    ``hrecipe``. The image URL is resolved against the response URL with
    ``urljoin``. Prep time, cook time and publication date are not
    available on this site and are passed to the loader as ``None``.
    """
    # we use this to run XPath commands against the HTML in the response
    hxs = HtmlXPathSelector(response)

    # base XPath for the element that contains the recipe info
    base_path = """//span[@class="hrecipe"]"""
    recipes_scopes = hxs.select(base_path)

    # it's easier to define these XPath strings outside of the loop below
    name_path = '//div[@class="content"]/header/h1[@class="fn"]/text()'
    description_path = '//article[@class="recipe_description"]//text()'
    image_path = '//div[@class="recipe_image_main"]/p/img/@src'
    recipeYield_path = '//div[@class="recipe_meta"]/p/span[contains(@class,"yield")]/text()'
    ingredients_path = '//article[@class="ingredients"]//ul//li/p[@class="ingredient"]/span[@class="value"]/text()'

    recipes = []

    # loop through our recipe scopes and extract the recipe data from each
    for r_scope in recipes_scopes:
        il = RecipeItemLoader(item=RecipeItem())

        il.add_value('source', self.source)
        il.add_value('name', r_scope.select(name_path).extract())

        # Only the first image is used. Popping from an empty list raised
        # IndexError and lost the whole page when a recipe had no image;
        # now the image is simply left unset in that case.
        images = r_scope.select(image_path).extract()
        if images:
            il.add_value('image', urljoin(response.url, images[0]))

        il.add_value('url', response.url)
        il.add_value('description',
                     r_scope.select(description_path).extract())

        # prepTime not available
        il.add_value('prepTime', None)

        # cookTime not available
        il.add_value('cookTime', None)

        il.add_value('recipeYield',
                     r_scope.select(recipeYield_path).extract())
        il.add_value('ingredients',
                     r_scope.select(ingredients_path).extract())

        # datePublished not available
        il.add_value('datePublished', None)

        # stick this RecipeItem in the array of recipes we will return
        recipes.append(il.load_item())

    # more processing is done by the openrecipes.pipelines. Look at that
    # file to see transforms that are applied to each RecipeItem
    return recipes
def parse_item(self, response):
    """Return one loaded recipe item per ``recipe hrecipe`` element.

    The publication date is not collected. Ingredient lines that start
    with ``For `` or end with ``:`` are dropped.
    """
    selector = HtmlXPathSelector(response)

    title_xpath = '//*[@class="fn"]/text()'
    summary_xpath = '//*[@class="summary"]/p/text()'
    photo_xpath = '//p[1]/span/img/@src'
    prep_xpath = '//*[@class="preptime"]/text()'
    cook_xpath = './/*[@class="cooktime"]/text()'
    yield_xpath = '//*[@class="yield"]/text()'
    ingredient_xpath = './/*[@class="ingredient"]/p/text()'
    # date formatting matches forthelovecooking (kind of odd), so it is
    # left out for now

    heading_re = re.compile(r'^For ')

    items = []
    for scope in selector.select('//*[@class="recipe hrecipe"]'):
        loader = RecipeItemLoader(item=RecipeItem())

        loader.add_value('source', self.source)
        loader.add_value('name', scope.select(title_xpath).extract())
        loader.add_value('image', scope.select(photo_xpath).extract())
        loader.add_value('url', response.url)
        loader.add_value('description', scope.select(summary_xpath).extract())
        loader.add_value('prepTime', scope.select(prep_xpath).extract())
        loader.add_value('cookTime', scope.select(cook_xpath).extract())
        loader.add_value('recipeYield', scope.select(yield_xpath).extract())

        lines = [
            node.extract().strip()
            for node in scope.select(ingredient_xpath)
        ]
        loader.add_value('ingredients', [
            line for line in lines
            if not (heading_re.match(line) or line.endswith(':'))
        ])

        items.append(loader.load_item())

    return items
def parse_item(self, response):
    """Extract recipe items from each ``id="recipe"`` element on the page.

    Returns a list of loaded ``RecipeItem`` objects. Prep time, cook time
    and publication date are not collected: the site only offers a total
    time and no date.
    """
    hxs = HtmlXPathSelector(response)

    base_path = '//*[@id="recipe"]'
    recipes_scopes = hxs.select(base_path)

    name_path = '//*[@class="row page_title clearfix"]/h2/text()'
    description_path = '//*[@class="entry"]/p//text()'
    image_path = '//*[@class="featured_image"]/img[@class="image"]/@src'
    recipeYield_path = '//*[@class="breakdown"]/tbody/tr[1]/td[1]/text()'
    ingredients_path = '*//*[@class="ingredients"]'

    recipes = []
    for r_scope in recipes_scopes:
        il = RecipeItemLoader(item=RecipeItem())

        # every other parse_item in this file records where the recipe
        # came from; this one had the line commented out
        # NOTE(review): assumes the spider defines ``source`` -- confirm
        il.add_value('source', self.source)
        il.add_value('name', r_scope.select(name_path).extract())
        il.add_value('image', r_scope.select(image_path).extract())
        il.add_value('url', response.url)
        il.add_value('description',
                     r_scope.select(description_path).extract())
        il.add_value('recipeYield',
                     r_scope.select(recipeYield_path).extract())

        # Build one ingredient per table row. The previous sub-queries
        # started with "//", which searches the whole document, so every
        # amount and every name on the page was merged into one string.
        # The cells are kept as HTML, matching the other parse_item written
        # for this same markup.
        # NOTE(review): presumably the pipeline strips the HTML -- confirm
        ingredients = []
        for table in r_scope.select(ingredients_path):
            for row in table.select('.//tr'):
                ingredient = " ".join(row.select('td').extract()).strip()
                if ingredient:
                    ingredients.append(ingredient)
        il.add_value('ingredients', ingredients)

        recipes.append(il.load_item())

    return recipes
def parse_item(self, response):
    """Return one loaded recipe item per ``div#recipe`` element.

    Description, URL and image come from the page's Open Graph meta
    tags; the remaining fields are read with the XPaths defined below.
    """
    selector = HtmlXPathSelector(response)

    title_xpath = 'h1/text()'
    description_xpath = '//meta[@property="og:description"]/@content'
    canonical_xpath = '//meta[@property="og:url"]/@content'
    photo_xpath = '//meta[@property="og:image"][1]/@content'
    prep_xpath = './/span[@class="preptime"]/span[@class="value-title"]/@title'
    cook_xpath = './/span[@class="cooktime"]/span[@class="value-title"]/@title'

    # the yield is formatted very inconsistently, so several alternatives
    # are combined into one XPath union
    yield_alternatives = [
        '//div[@id="recipe"]/p[starts-with(i,"Makes")]/i',
        '//div[@id="recipe"]/p[starts-with(i,"Serves")]/i',
        '//div[@id="recipe"]/p[starts-with(em,"Makes")]/em',
        '//div[@id="recipe"]/p[starts-with(em,"Serves")]/em',
        '//div[@id="recipe"][starts-with(p,"Makes")]/p',
        '//div[@id="recipe"][starts-with(p,"Serves")]/p',
    ]
    yield_xpath = "|".join(yield_alternatives)

    ingredient_xpath = 'blockquote/*'
    published_xpath = '//span[@class="published"]/span[@class="value-title"]/@title'

    items = []
    for scope in selector.select("""//div[@id="recipe"]"""):
        loader = RecipeItemLoader(item=RecipeItem())

        loader.add_value('source', self.source)
        loader.add_value('name', scope.select(title_xpath).extract())
        loader.add_value('image', scope.select(photo_xpath).extract())
        loader.add_value('url', scope.select(canonical_xpath).extract())
        loader.add_value(
            'description', scope.select(description_xpath).extract())
        loader.add_value('prepTime', scope.select(prep_xpath).extract())
        loader.add_value('cookTime', scope.select(cook_xpath).extract())
        loader.add_value('recipeYield', scope.select(yield_xpath).extract())
        loader.add_value(
            'ingredients', scope.select(ingredient_xpath).extract())
        loader.add_value(
            'datePublished', scope.select(published_xpath).extract())

        items.append(loader.load_item())

    return items
def parse_item(self, response):
    """Return one loaded recipe item per ``hrecipe`` element.

    Each ingredient is built as ``"<amount> <name>"`` from the child
    elements classed ``amount`` and ``name``.
    """
    selector = HtmlXPathSelector(response)

    # many recipes on the site lack the semantic markup; those are not
    # pursued, only hrecipe blocks are used
    title_xpath = './/*[@class="fn"]/text()'
    canonical_xpath = '//meta[@property="og:url"]/@content'
    photo_xpath = '//meta[@property="og:image"][1]/@content'
    yield_xpath = './/*[@class="yield"]/text()'
    ingredient_xpath = '*//*[@class="ingredient"]'
    # the date sits elsewhere on the page, not under hrecipe
    published_xpath = '//*[@class="date"][1]'

    items = []
    for scope in selector.select("""//*[@class="hrecipe"]"""):
        loader = RecipeItemLoader(item=RecipeItem())

        loader.add_value('source', self.source)
        loader.add_value('name', scope.select(title_xpath).extract())
        loader.add_value('image', scope.select(photo_xpath).extract())
        loader.add_value('url', scope.select(canonical_xpath).extract())
        loader.add_value('recipeYield', scope.select(yield_xpath).extract())

        ingredient_lines = []
        for node in scope.select(ingredient_xpath):
            quantity_text = "".join(
                node.select('*[@class="amount"]/text()').extract()).strip()
            name_text = "".join(
                node.select('*[@class="name"]/text()').extract()).strip()
            ingredient_lines.append("%s %s" % (quantity_text, name_text))
        loader.add_value('ingredients', ingredient_lines)

        loader.add_value(
            'datePublished', scope.select(published_xpath).extract())

        items.append(loader.load_item())

    return items
def parse_item(self, response):
    """Return one loaded recipe item per ``div.innerrecipe`` element.

    Besides prep and cook time this spider also records ``totalTime``.
    """
    selector = HtmlXPathSelector(response)

    title_xpath = '*//h2[@class="fn"]/text()'
    photo_xpath = '*//img[@class="photo"]/@src'
    prep_xpath = '*//span[@class="preptime"]/text()'
    cook_xpath = '*//span[@class="cooktime"]/text()'
    total_xpath = '*//span[@class="duration"]/text()'
    yield_xpath = '*//span[@class="yield"]/text()'
    published_xpath = '//div[@class="post fullpost singlepost"]//div[@class="postmeta"]/text()[normalize-space()]'
    ingredient_xpath = '*//*[@class="ingredient"]/p'

    items = []
    for scope in selector.select('//div[@class="innerrecipe"]'):
        loader = RecipeItemLoader(item=RecipeItem())

        loader.add_value('source', self.source)
        loader.add_value('name', scope.select(title_xpath).extract())
        loader.add_value('image', scope.select(photo_xpath).extract())
        loader.add_value('url', response.url)
        loader.add_value('prepTime', scope.select(prep_xpath).extract())
        loader.add_value('cookTime', scope.select(cook_xpath).extract())
        loader.add_value('totalTime', scope.select(total_xpath).extract())
        loader.add_value('recipeYield', scope.select(yield_xpath).extract())

        # each <p> contributes the concatenation of its direct text nodes
        loader.add_value('ingredients', [
            "".join(paragraph.select('text()').extract())
            for paragraph in scope.select(ingredient_xpath)
        ])

        loader.add_value(
            'datePublished', scope.select(published_xpath).extract())

        items.append(loader.load_item())

    return items
def parse_item(self, response):
    """Return a one-element list holding the page's recipe item.

    The item is seeded from ``parse_recipe`` and then gets image,
    description and name values added from the page.
    """
    selector = HtmlXPathSelector(response)

    seed = parse_recipe(
        selector, {'url': response.url, 'source': self.source})
    item_loader = RecipeItemLoader(item=RecipeItem.from_dict(seed))

    image_values = select_class(
        selector, 'post_image').select('@src').extract()
    item_loader.add_value('image', image_values)

    description_values = selector.select(
        '//meta[@name="description"]/@content').extract()
    item_loader.add_value('description', description_values)

    name_values = select_class(
        selector, 'entry-title').select('text()').extract()
    item_loader.add_value('name', name_values)

    return [item_loader.load_item()]
def parse_item(self, response):
    """Return one loaded recipe item per schema.org Recipe element.

    The source is recorded as the literal ``'allrecipes'``. Prep and cook
    time selections are passed through ``parse_iso_date`` before being
    stored.
    """
    selector = HtmlXPathSelector(response)

    title_xpath = '//*[@itemprop="name"]/text()'
    description_xpath = '//*[@itemprop="description"]/text()'
    canonical_xpath = '//meta[@property="og:url"]/@content'
    photo_xpath = '//*[@itemprop="image"]/@src'
    yield_xpath = '*//*[@itemprop="recipeYield"]/text()'
    prep_xpath = '//*[@itemprop="prepTime"]'
    cook_xpath = '//*[@itemprop="cookTime"]'
    ingredient_xpath = '//*[@itemprop="ingredients"]'

    items = []
    for scope in selector.select('//*[@itemtype="http://schema.org/Recipe"]'):
        loader = RecipeItemLoader(item=RecipeItem())

        loader.add_value('source', 'allrecipes')
        loader.add_value('name', scope.select(title_xpath).extract())
        loader.add_value('image', scope.select(photo_xpath).extract())
        loader.add_value('url', scope.select(canonical_xpath).extract())
        loader.add_value(
            'description', scope.select(description_xpath).extract())

        # the helper receives the selection itself, not extracted strings
        loader.add_value(
            'prepTime', parse_iso_date(scope.select(prep_xpath)))
        loader.add_value(
            'cookTime', parse_iso_date(scope.select(cook_xpath)))

        loader.add_value('recipeYield', scope.select(yield_xpath).extract())

        # text of each child node, space-separated, per ingredient element
        loader.add_value('ingredients', [
            ' '.join(node.select('node()/text()').extract())
            for node in scope.select(ingredient_xpath)
        ])

        items.append(loader.load_item())

    return items
def parse_item(self, response):
    """Spider template: every XPath is still the placeholder ``'TODO'``.

    Returns a list of loaded recipe items, one per scope matched by the
    base path. Ingredient extraction is not implemented yet, so the
    ingredients value is always an empty list.
    """
    selector = HtmlXPathSelector(response)

    scopes = selector.select('TODO')

    title_xpath = 'TODO'
    description_xpath = 'TODO'
    photo_xpath = 'TODO'
    prep_xpath = 'TODO'
    cook_xpath = 'TODO'
    yield_xpath = 'TODO'
    ingredient_xpath = 'TODO'
    published_xpath = 'TODO'

    items = []
    for scope in scopes:
        loader = RecipeItemLoader(item=RecipeItem())

        loader.add_value('source', self.source)
        loader.add_value('name', scope.select(title_xpath).extract())
        loader.add_value('image', scope.select(photo_xpath).extract())
        loader.add_value('url', response.url)
        loader.add_value(
            'description', scope.select(description_xpath).extract())
        loader.add_value('prepTime', scope.select(prep_xpath).extract())
        loader.add_value('cookTime', scope.select(cook_xpath).extract())
        loader.add_value('recipeYield', scope.select(yield_xpath).extract())

        ingredient_lines = []
        for _node in scope.select(ingredient_xpath):
            # placeholder: fill in per-ingredient extraction here
            pass
        loader.add_value('ingredients', ingredient_lines)

        loader.add_value(
            'datePublished', scope.select(published_xpath).extract())

        items.append(loader.load_item())

    return items
def clean_item(old_dict):
    """Re-run a stored recipe dict through a fresh item loader.

    ``old_dict`` itself is left untouched. A copy without the ``ts`` and
    ``_id`` keys is fed, key by key, to ``set_value`` on a new
    ``RecipeItemLoader``.

    Returns a ``(new_item, source_dict)`` tuple: the loaded item and the
    trimmed copy that was used to build it. Raises ``KeyError`` if
    ``old_dict`` lacks ``ts`` or ``_id``.
    """
    # work on a copy so the caller's dict keeps its ts and _id fields
    trimmed = dict(old_dict)
    for bookkeeping_key in ('ts', '_id'):
        del trimmed[bookkeeping_key]

    if VERBOSE:
        message = "Examining '%s' from '%s' (%s)..." % (
            old_dict['name'], old_dict['source'], old_dict['_id'])
        print(message)

    item_loader = RecipeItemLoader(RecipeItem())
    for field, value in trimmed.iteritems():
        # set_value hands back the loader to keep using
        item_loader = set_value(item_loader, field, value)

    return item_loader.load_item(), trimmed
def parse_item(self, response):
    """Return one loaded recipe item per ``div#zlrecipe-innerdiv``.

    The URL is taken from the page's canonical link and the image from
    its Open Graph meta tag.
    """
    selector = HtmlXPathSelector(response)

    title_xpath = '*//*[@itemprop="name"]/text()'
    canonical_xpath = '//link[@rel="canonical"]/@href'
    photo_xpath = '//meta[@property="og:image"][1]/@content'
    prep_xpath = '*//*[@itemprop="prepTime"]/@content'
    cook_xpath = '*//*[@itemprop="cookTime"]/@content'
    yield_xpath = '*//*[@itemprop="recipeYield"]/text()'
    ingredient_xpath = '*//*[@itemprop="ingredients"]'
    published_xpath = '//*[@class="time_stamp_month"]'

    items = []
    for scope in selector.select("""//div[@id="zlrecipe-innerdiv"]"""):
        loader = RecipeItemLoader(item=RecipeItem())

        loader.add_value('source', self.source)
        loader.add_value('name', scope.select(title_xpath).extract())
        loader.add_value('image', scope.select(photo_xpath).extract())
        loader.add_value('url', scope.select(canonical_xpath).extract())
        loader.add_value('prepTime', scope.select(prep_xpath).extract())
        loader.add_value('cookTime', scope.select(cook_xpath).extract())
        loader.add_value('recipeYield', scope.select(yield_xpath).extract())

        # all descendant text of each ingredient element, glued together
        loader.add_value('ingredients', [
            ''.join(node.select('.//text()').extract()).strip()
            for node in scope.select(ingredient_xpath)
        ])

        loader.add_value(
            'datePublished', scope.select(published_xpath).extract())

        items.append(loader.load_item())

    return items
def parse_item(self, response):
    """Return one loaded recipe item per ``div#blq-main`` element.

    Each ingredient is ``"<amount> <name>"``: the amount is the
    paragraph's first text node and the name is the text of its links.
    """
    selector = HtmlXPathSelector(response)

    title_xpath = '//h1/text()'
    description_xpath = '//div[@id="description"]//span[@class="summary"]/text()'
    photo_xpath = '//img[@id="food-image"]/@src'
    prep_xpath = '//span[@class="prepTime"]/span[@class="value-title"]/@title'
    cook_xpath = '//span[@class="cookTime"]/span[@class="value-title"]/@title'
    yield_xpath = '//h3[@class="yield"]/text()'
    ingredient_xpath = '//p[@class="ingredient"]'

    items = []
    for scope in selector.select("""//div[@id="blq-main"]"""):
        loader = RecipeItemLoader(item=RecipeItem())

        loader.add_value('source', self.source)
        loader.add_value('name', scope.select(title_xpath).extract())
        loader.add_value('image', scope.select(photo_xpath).extract())
        loader.add_value('url', response.url)
        loader.add_value(
            'description', scope.select(description_xpath).extract())
        loader.add_value('prepTime', scope.select(prep_xpath).extract())
        loader.add_value('cookTime', scope.select(cook_xpath).extract())
        loader.add_value('recipeYield', scope.select(yield_xpath).extract())

        ingredient_lines = []
        for paragraph in scope.select(ingredient_xpath):
            quantity_text = "".join(
                paragraph.select('text()[1]').extract()).strip()
            name_text = "".join(
                paragraph.select('a/text()').extract()).strip()
            ingredient_lines.append("%s %s" % (quantity_text, name_text))
        loader.add_value('ingredients', ingredient_lines)

        items.append(loader.load_item())

    return items
def parse_item(self, response):
    """Return one loaded recipe item per element whose class contains
    ``hrecipe``.

    Each ingredient is ``"<quantity> <name>"``, built from the
    ``quantity`` and ``name`` child spans.
    """
    selector = HtmlXPathSelector(response)

    title_xpath = '//meta[@property="og:title"]/@content'
    description_xpath = '//meta[@name="description"]/@content'
    canonical_xpath = '//meta[@property="og:url"]/@content'
    photo_xpath = '//*[@itemprop="image"]/@src'
    yield_xpath = '//div[@class="time-and-yield"]/*/span[@class="yield"]/text()'
    ingredient_xpath = '//ul[@class="ingredients"]/li/span[@class="ingredient"]'
    # text()[last()] skips the HTML comment
    published_xpath = '//div[@class="intro"]/div[@class="display-date"]/text()[last()]'

    items = []
    for scope in selector.select("""//*[contains(@class,'hrecipe')]"""):
        loader = RecipeItemLoader(item=RecipeItem())

        loader.add_value('source', self.source)
        loader.add_value('name', scope.select(title_xpath).extract())
        loader.add_value('image', scope.select(photo_xpath).extract())
        loader.add_value('url', scope.select(canonical_xpath).extract())
        loader.add_value(
            'description', scope.select(description_xpath).extract())
        loader.add_value('recipeYield', scope.select(yield_xpath).extract())

        ingredient_lines = []
        for span in scope.select(ingredient_xpath):
            quantity_text = "".join(
                span.select('span[@class="quantity"]/text()').extract()
            ).strip()
            name_text = "".join(
                span.select('span[@class="name"]/text()').extract()
            ).strip()
            ingredient_lines.append("%s %s" % (quantity_text, name_text))
        loader.add_value('ingredients', ingredient_lines)

        loader.add_value(
            'datePublished', scope.select(published_xpath).extract())

        items.append(loader.load_item())

    return items
def parse_item(self, response):
    """Build one recipe item per ``div#content`` element in the response.

    All field XPaths are relative to the scope element. Ingredient text
    nodes are stripped, and lines that start with "For " or end with ":"
    are left out of the ingredient list.

    Returns a list of the items produced by ``RecipeItemLoader.load_item()``.
    """
    selector = HtmlXPathSelector(response)

    name_xpath = './/span[@class="item"]/h2[@class="fn"]/text()'
    image_xpath = (
        "descendant-or-self::img[@class and contains(concat(' ', "
        "normalize-space(@class), ' '), ' size-full ')][1]/@src"
    )
    ingredient_xpath = './/div[@class="ingredient"]/p/text()'
    # Fields added after the URL, each read straight from a single XPath.
    detail_xpaths = (
        ('prepTime', './/span[@class="preptime"]/text()'),
        ('cookTime', './/span[@class="cooktime"]/text()'),
        ('recipeYield', './/span[@class="yield"]/text()'),
    )
    section_label = re.compile(r'^For ')

    items = []
    for scope in selector.select('//div[@id="content"]'):
        loader = RecipeItemLoader(item=RecipeItem())
        loader.add_value('source', self.source)
        loader.add_value('name', scope.select(name_xpath).extract())
        loader.add_value('image', scope.select(image_xpath).extract())
        loader.add_value('url', response.url)
        for field, xpath in detail_xpaths:
            loader.add_value(field, scope.select(xpath).extract())

        kept_lines = []
        for text_node in scope.select(ingredient_xpath):
            line = text_node.extract().strip()
            # Drop label lines ("For ..." / "...:") rather than treating
            # them as ingredients.
            if section_label.match(line) or line.endswith(':'):
                continue
            kept_lines.append(line)
        loader.add_value('ingredients', kept_lines)

        items.append(loader.load_item())
    return items
def parse_item(self, response):
    """Build one recipe item per element with ``id="recipe"``.

    Ingredients are laid out as table rows: each ``<tr>`` under the
    ``ingredients`` element becomes one ingredient string made of its
    ``<td>`` cells joined with spaces.

    NOTE: the field XPaths start with ``//``, so they are matched against
    the whole document even though they are run from the scope selector.

    Returns a list of the items produced by ``RecipeItemLoader.load_item()``.
    """
    selector = HtmlXPathSelector(response)

    name_xpath = '//*[@class="row page_title clearfix"]/h2/text()'
    image_xpath = '//*[@class="featured_image"]/img[@class="image"]/@src'
    description_xpath = '//*[@class="entry"]/p//text()'
    yield_xpath = '//*[@class="breakdown"]/tbody/tr[1]/td[1]/text()'
    row_xpath = '//*[@class="ingredients"]/tr'

    items = []
    for scope in selector.select('//*[@id="recipe"]'):
        loader = RecipeItemLoader(item=RecipeItem())
        loader.add_value('source', self.source)
        loader.add_value('name', scope.select(name_xpath).extract())
        loader.add_value('image', scope.select(image_xpath).extract())
        loader.add_value('url', response.url)
        loader.add_value(
            'description', scope.select(description_xpath).extract())
        loader.add_value('recipeYield', scope.select(yield_xpath).extract())

        # One string per TR: the extracted TDs joined with spaces. The
        # cells are extracted as markup; per the original note, the HTML
        # is stripped later in the pipeline.
        ingredient_lines = [
            " ".join(row.select('td').extract()).strip()
            for row in scope.select(row_xpath)
        ]
        loader.add_value('ingredients', ingredient_lines)

        items.append(loader.load_item())
    return items
def parse_item(self, response):
    """Build one recipe item per ``blockquote.recipe`` element.

    Name, image, URL and description come from Open Graph ``<meta>`` tags;
    times, yield and ingredients come from ``itemprop`` elements below the
    blockquote. Every ingredient element is extracted, stripped, and
    stored with a single trailing space.

    Returns a list of the items produced by ``RecipeItemLoader.load_item()``.
    """
    selector = HtmlXPathSelector(response)

    # Plain fields, in the order they are handed to the loader.
    field_xpaths = (
        ('name', '//meta[@property="og:title"]/@content'),
        ('image', '//meta[@property="og:image"][1]/@content'),
        ('url', '//meta[@property="og:url"]/@content'),
        ('description', '//meta[@property="og:description"]/@content'),
        ('prepTime', '*//*[@itemprop="prepTime"]/@content'),
        ('cookTime', '*//*[@itemprop="cookTime"]/@content'),
        ('recipeYield', '*//*[@itemprop="recipeYield"]/text()'),
    )
    ingredient_xpath = '*//*[@itemprop="ingredients"]'
    published_xpath = '//p[@class="date"]/text()'

    items = []
    for scope in selector.select('//blockquote[@class="recipe"]'):
        loader = RecipeItemLoader(item=RecipeItem())
        loader.add_value('source', self.source)
        for field, xpath in field_xpaths:
            loader.add_value(field, scope.select(xpath).extract())

        ingredient_lines = [
            "%s " % node.extract().strip()
            for node in scope.select(ingredient_xpath)
        ]
        loader.add_value('ingredients', ingredient_lines)

        loader.add_value(
            'datePublished', scope.select(published_xpath).extract())
        items.append(loader.load_item())
    return items
def parse_item(self, response):
    """Build one recipe item per ``div.post`` element in the response.

    The description is the post's direct ``div.entry`` text joined into one
    stripped string. Ingredients are not marked up, so every ``<p>`` in the
    entry is scored with ``ingredient_heuristic``; only paragraphs scoring
    above ``RECIPE_THRESHOLD`` contribute their stripped text nodes.

    Returns a list of the items produced by ``RecipeItemLoader.load_item()``.
    """
    selector = HtmlXPathSelector(response)

    name_xpath = 'h2/a[@rel="bookmark"]/text()'
    # Parenthesised so that [1] picks the first match in the whole document.
    image_xpath = '(//div[@class="entry"]/p/a[@title]/img/@src)[1]'
    description_xpath = 'div[@class="entry"]/text()'
    paragraph_xpath = 'div[@class="entry"]/p'
    published_xpath = 'div[@class="date"]/text()'

    items = []
    for scope in selector.select('//div[@class="post"]'):
        loader = RecipeItemLoader(item=RecipeItem())
        loader.add_value('source', self.source)
        loader.add_value('name', scope.select(name_xpath).extract())
        loader.add_value('image', scope.select(image_xpath).extract())
        loader.add_value('url', response.url)

        description_parts = scope.select(description_xpath).extract()
        loader.add_value('description', ''.join(description_parts).strip())

        ingredient_lines = []
        for paragraph in scope.select(paragraph_xpath):
            if ingredient_heuristic(paragraph) > RECIPE_THRESHOLD:
                ingredient_lines.extend(
                    text_node.extract().strip()
                    for text_node in paragraph.select('text()')
                )
        loader.add_value('ingredients', ingredient_lines)

        loader.add_value(
            'datePublished', scope.select(published_xpath).extract())
        items.append(loader.load_item())
    return items
def parse_item(self, response):
    """Build one recipe item per data-vocabulary.org Recipe ``<article>``.

    Every item gets ``source`` from ``self.source`` and ``url`` from
    ``response.url``; the remaining fields are read with the XPaths below.
    Each ``li[itemprop="ingredient"]`` yields one UTF-8 encoded ingredient
    string: its amount text followed by its name text, joined with spaces.
    An ``<li>`` that provides neither amount nor name is skipped.

    NOTE: the field XPaths start with ``//``, so they are matched against
    the whole document even though they are run from the scope selector.

    Returns a list of the items produced by ``RecipeItemLoader.load_item()``.
    """
    hxs = HtmlXPathSelector(response)

    base_path = '//article[@itemtype="http://data-vocabulary.org/Recipe"]'
    name_path = '//h1[@itemprop="name"]/text()'
    image_path = '//img[@itemprop="photo"]/@src'
    ingredients_path = '//li[@itemprop="ingredient"]'
    # These two are relative to a single ingredient <li>.
    ingredients_amounts_path = './span[@itemprop="amount"]/span/text()'
    ingredients_names_path = './span[@itemprop="name"]/text()'
    # Fields added after the URL, each read straight from a single XPath.
    detail_paths = (
        ('description', '//meta[@name="description"]/@content'),
        ('prepTime', '//span[@itemprop="prepTime"]/text()'),
        ('cookTime', '//span[@itemprop="cookTime"]/text()'),
        ('recipeYield', '//span[@itemprop="yield"]/text()'),
    )

    recipes = []
    for r_scope in hxs.select(base_path):
        il = RecipeItemLoader(item=RecipeItem())
        il.add_value('source', self.source)
        il.add_value('name', r_scope.select(name_path).extract())
        il.add_value('image', r_scope.select(image_path).extract())
        il.add_value('url', response.url)
        for field, path in detail_paths:
            il.add_value(field, r_scope.select(path).extract())

        # Pair amount and name inside each <li>. Zipping two document-wide
        # lists would misalign every later ingredient as soon as one <li>
        # lacks an amount or a name, and would drop the unmatched tail.
        ingredients = []
        for i_scope in r_scope.select(ingredients_path):
            parts = (i_scope.select(ingredients_amounts_path).extract() +
                     i_scope.select(ingredients_names_path).extract())
            if parts:
                ingredients.append(" ".join(parts).encode('utf-8'))
        il.add_value('ingredients', ingredients)

        recipes.append(il.load_item())
    return recipes
def parse_item(self, response):
    """Build a recipe item from the page, scoped at the ``<html>`` element.

    All field XPaths are relative to that scope (``.//`` or
    ``descendant-or-self::``). Ingredients are the text nodes of every
    ``li.ingredient``, passed on unmodified.

    Returns a list of the items produced by ``RecipeItemLoader.load_item()``.
    """
    selector = HtmlXPathSelector(response)

    name_xpath = './/p[@id="title"]/text()'
    image_xpath = './/img[@class="photo"]/@src'
    ingredient_xpath = './/li[@class="ingredient"]/text()'
    # Fields added after the URL, each read straight from a single XPath.
    detail_xpaths = (
        ('description',
         "descendant-or-self::p[@class and contains(concat(' ', "
         "normalize-space(@class), ' '), ' summary ')]/text()"),
        ('prepTime', './/span[@class="preptime"]/text()'),
        ('cookTime', './/span[@class="cooktime"]/text()'),
        ('recipeYield',
         './/p[@id="ingr_header"]/span[@class="single_recipe_text"]/text()'),
    )

    items = []
    for scope in selector.select('//html'):
        loader = RecipeItemLoader(item=RecipeItem())
        loader.add_value('source', self.source)
        loader.add_value('name', scope.select(name_xpath).extract())
        loader.add_value('image', scope.select(image_xpath).extract())
        loader.add_value('url', response.url)
        for field, xpath in detail_xpaths:
            loader.add_value(field, scope.select(xpath).extract())

        ingredient_lines = [
            text_node.extract() for text_node in scope.select(ingredient_xpath)
        ]
        loader.add_value('ingredients', ingredient_lines)

        items.append(loader.load_item())
    return items
def parse_item(self, response):
    """Build one recipe item per schema.org Recipe ``<div>`` in the response.

    The URL is taken from the post title link or the canonical ``<link>``.
    Ingredient text nodes are stripped and empty results are discarded.

    NOTE: the field XPaths start with ``//`` (or ``(//``), so they are
    matched against the whole document even though they are run from the
    scope selector.

    Returns a list of the items produced by ``RecipeItemLoader.load_item()``.
    """
    selector = HtmlXPathSelector(response)

    image_xpath = '(//div[@class="entry"]//img/@src)[1]'
    name_xpath = '//div[@itemprop="name"]/text()'
    url_xpath = '//h2[@class="title"]/a/@href | //link[@rel="canonical"]/@href'
    ingredient_xpath = '//li[@itemprop="ingredients"]/text()'
    serving_xpath = '//span[@itemprop="servingSize"]/text()'
    total_time_xpath = '//span[@itemprop="totalTime"]/@content'

    items = []
    for scope in selector.select('//div[@itemtype="http://schema.org/Recipe"]'):
        loader = RecipeItemLoader(item=RecipeItem())
        loader.add_value('source', self.source)
        loader.add_value('image', scope.select(image_xpath).extract())
        loader.add_value('name', scope.select(name_xpath).extract())
        loader.add_value('url', scope.select(url_xpath).extract())

        # Whitespace-only text nodes strip down to '' and are filtered out.
        stripped = (
            text_node.extract().strip()
            for text_node in scope.select(ingredient_xpath)
        )
        loader.add_value('ingredients', [line for line in stripped if line])

        loader.add_value('recipeYield', scope.select(serving_xpath).extract())
        loader.add_value(
            'totalTime', scope.select(total_time_xpath).extract())
        items.append(loader.load_item())
    return items
def parse_item(self, response):
    """Build one recipe item per ``blockquote`` with class "recipe hrecipe".

    Every field except ``source`` and ``url`` is the raw extraction of a
    single XPath keyed on an hRecipe-style class name (``fn``, ``summary``,
    ``photo``, ``preptime``, ``cooktime``, ``yield``, ``ingredient``).

    NOTE: the field XPaths start with ``//``, so they are matched against
    the whole document even though they are run from the scope selector.

    Returns a list of the items produced by ``RecipeItemLoader.load_item()``.
    """
    selector = HtmlXPathSelector(response)

    name_xpath = '//*[@class="fn"]/text()'
    image_xpath = '//img[@class="photo"]/@src'
    # Fields added after the URL, in the order they reach the loader.
    detail_xpaths = (
        ('description', '//*[@class="summary"]/p/text()'),
        ('prepTime', '//*[@class="preptime"]/text()'),
        ('cookTime', '//*[@class="cooktime"]/text()'),
        ('recipeYield', '//*[@class="yield"]/text()'),
        ('ingredients', '//*[@class="ingredient"]/p/text()'),
    )

    items = []
    for scope in selector.select('//blockquote[@class="recipe hrecipe"]'):
        loader = RecipeItemLoader(item=RecipeItem())
        loader.add_value('source', self.source)
        loader.add_value('name', scope.select(name_xpath).extract())
        loader.add_value('image', scope.select(image_xpath).extract())
        loader.add_value('url', response.url)
        for field, xpath in detail_xpaths:
            loader.add_value(field, scope.select(xpath).extract())
        items.append(loader.load_item())
    return items