示例#1
0
def clean_item(old_dict):
    """Run a stored recipe record back through RecipeItemLoader.

    Args:
        old_dict: mapping for the stored record. It must contain the 'ts'
            and '_id' keys (plus 'name' and 'source' when VERBOSE is set).
            The mapping itself is left untouched.

    Returns:
        A ``(new_item, source_dict)`` tuple: the item produced by the loader
        and the copy of ``old_dict`` with 'ts' and '_id' removed.
    """
    # work on a copy so the caller keeps the unmodified record, and drop the
    # two fields that must not be handed to the loader
    source_dict = dict(old_dict)
    for skipped_key in ('ts', '_id'):
        del source_dict[skipped_key]

    if VERBOSE:
        message = "Examining '%s' from '%s' (%s)..." % (
            old_dict['name'], old_dict['source'], old_dict['_id'])
        print(message)

    loader = RecipeItemLoader(RecipeItem())
    for key, value in source_dict.iteritems():
        # set_value() hands back the loader to keep using
        loader = set_value(loader, key, value)

    return loader.load_item(), source_dict
    def parse_item(self, response):
        """Load one recipe item from ``response``.

        The item is seeded from ``parse_recipe()`` and then given the image,
        description and name values selected from the page.

        Returns:
            A one-element list containing the loaded item.
        """
        hxs = HtmlXPathSelector(response)
        seed = {"url": response.url, "source": self.source}
        loader = RecipeItemLoader(
            item=RecipeItem.from_dict(parse_recipe(hxs, seed)))

        image_src = select_class(hxs, "post_image").select("@src")
        loader.add_value("image", image_src.extract())

        meta_description = hxs.select('//meta[@name="description"]/@content')
        loader.add_value("description", meta_description.extract())

        title_text = select_class(hxs, "entry-title").select("text()")
        loader.add_value("name", title_text.extract())

        return [loader.load_item()]
    def parse_item(self, response):
        """Load one recipe item from ``response``.

        The item is seeded from ``parse_recipe()`` and then given the image,
        description and name values selected from the page.

        Returns:
            The loaded item itself (not wrapped in a list).
        """
        selector = HtmlXPathSelector(response)
        base_fields = {'url': response.url, 'source': self.source}
        parsed = parse_recipe(selector, base_fields)
        loader = RecipeItemLoader(item=RecipeItem.from_dict(parsed))

        # (field, selected nodes) pairs, added in this order
        page_values = (
            ('image',
             select_class(selector, 'post_image').select('@src')),
            ('description',
             selector.select('//meta[@name="description"]/@content')),
            ('name',
             select_class(selector, 'entry-title').select('text()')),
        )
        for field, nodes in page_values:
            loader.add_value(field, nodes.extract())

        return loader.load_item()
示例#4
0
def clean_item(old_dict):
    """Rebuild a recipe item from a stored record via RecipeItemLoader.

    Args:
        old_dict: mapping for the stored record; 'ts' and '_id' must be
            present (and 'name'/'source' as well when VERBOSE is set). It is
            not modified.

    Returns:
        A two-tuple of the freshly loaded item and the copy of ``old_dict``
        that was fed to the loader (i.e. without 'ts' and '_id').
    """
    # shallow copy: the original record stays unmodified
    source_dict = dict(old_dict)
    # ts and _id are not passed on to the loader
    source_dict.pop('ts')
    source_dict.pop('_id')

    if VERBOSE:
        details = (old_dict['name'], old_dict['source'], old_dict['_id'])
        print("Examining '%s' from '%s' (%s)..." % details)

    loader = RecipeItemLoader(RecipeItem())
    for field, field_value in source_dict.iteritems():
        loader = set_value(loader, field, field_value)

    new_item = loader.load_item()
    return new_item, source_dict
示例#5
0
    def parse_item(self, response):
        """Load one recipe item from ``response``.

        ``parse_recipe()`` supplies the starting item; image, description
        and name values selected from the page are added on top.

        Returns:
            A one-element list containing the loaded item.
        """
        hxs = HtmlXPathSelector(response)
        recipe = RecipeItem.from_dict(
            parse_recipe(hxs, {'url': response.url, 'source': self.source}))
        loader = RecipeItemLoader(item=recipe)

        def add_extracted(field, nodes):
            # store the extracted values of the selected nodes under `field`
            loader.add_value(field, nodes.extract())

        add_extracted('image',
                      select_class(hxs, 'post_image').select('@src'))
        add_extracted('description',
                      hxs.select('//meta[@name="description"]/@content'))
        add_extracted('name',
                      select_class(hxs, 'entry-title').select('text()'))

        return [loader.load_item()]
    def parse_item(self, response):
        """Load one recipe item per ``blockquote`` with class "recipe hrecipe".

        Returns:
            A list of loaded items, one per matching element (empty when the
            page has none).
        """
        hxs = HtmlXPathSelector(response)
        scopes = hxs.select('//blockquote[@class="recipe hrecipe"]')

        # NOTE(review): every path below starts with '//', so it is evaluated
        # against the whole document even though it is selected from a recipe
        # scope -- confirm that this is intended.
        fields_before_url = (
            ('name', '//*[@class="fn"]/text()'),
            ('image', '//img[@class="photo"]/@src'),
        )
        fields_after_url = (
            ('description', '//*[@class="summary"]/p/text()'),
            ('prepTime', '//*[@class="preptime"]/text()'),
            ('cookTime', '//*[@class="cooktime"]/text()'),
            ('recipeYield', '//*[@class="yield"]/text()'),
            ('ingredients', '//*[@class="ingredient"]/p/text()'),
        )

        recipes = []
        for scope in scopes:
            il = RecipeItemLoader(item=RecipeItem())
            il.add_value('source', self.source)

            for field, path in fields_before_url:
                il.add_value(field, scope.select(path).extract())
            il.add_value('url', response.url)
            for field, path in fields_after_url:
                il.add_value(field, scope.select(path).extract())

            recipes.append(il.load_item())

        return recipes
    def parse_item(self, response):
        """Load one recipe item per ``div`` with class "innerrecipe".

        Returns:
            A list of loaded items, one per matching element (empty when the
            page has none).
        """
        hxs = HtmlXPathSelector(response)
        scopes = hxs.select('//div[@class="innerrecipe"]')

        fields_before_url = (
            ("name", '*//h2[@class="fn"]/text()'),
            ("image", '*//img[@class="photo"]/@src'),
        )
        fields_after_url = (
            ("prepTime", '*//span[@class="preptime"]/text()'),
            ("cookTime", '*//span[@class="cooktime"]/text()'),
            ("totalTime", '*//span[@class="duration"]/text()'),
            ("recipeYield", '*//span[@class="yield"]/text()'),
        )
        ingredient_nodes_path = '*//*[@class="ingredient"]/p'
        # unlike the paths above this one starts at the document root
        published_path = '//div[@class="post fullpost singlepost"]//div[@class="postmeta"]/text()[normalize-space()]'

        recipes = []
        for scope in scopes:
            il = RecipeItemLoader(item=RecipeItem())
            il.add_value("source", self.source)

            for field, path in fields_before_url:
                il.add_value(field, scope.select(path).extract())
            il.add_value("url", response.url)
            for field, path in fields_after_url:
                il.add_value(field, scope.select(path).extract())

            # each ingredient is the concatenation of its <p>'s text nodes
            il.add_value("ingredients", [
                "".join(node.select("text()").extract())
                for node in scope.select(ingredient_nodes_path)
            ])

            il.add_value("datePublished",
                         scope.select(published_path).extract())

            recipes.append(il.load_item())

        return recipes
    def parse_item(self, response):
        """Load one recipe item per element whose class is "recipe".

        Returns:
            A list of loaded items, one per matching element (empty when the
            page has none).
        """
        hxs = HtmlXPathSelector(response)
        scopes = hxs.select('//*[@class="recipe"]')

        # name, image, url and the date come from document-level nodes
        # ('//' paths); yield and ingredients are relative to the scope
        head_fields = (
            ('name', '//meta[@property="og:title"]/@content'),
            ('image', '//meta[@property="og:image"]/@content'),
            ('url', '//meta[@property="og:url"]/@content'),
        )
        yield_path = './p[1]/text()'
        ingredient_nodes_path = './p[1][br]|./p[2][br]'
        published_path = '//time[@class="entry-date"]/@datetime'

        recipes = []
        for scope in scopes:
            il = RecipeItemLoader(item=RecipeItem())
            il.add_value('source', self.source)

            for field, path in head_fields:
                il.add_value(field, scope.select(path).extract())

            # the first paragraph only counts as the yield when it mentions
            # "serves"
            yield_text = "".join(scope.select(yield_path).extract())
            if 'serves' in yield_text:
                il.add_value('recipeYield', yield_text)

            # one list of text nodes per matching paragraph
            il.add_value('ingredients', [
                paragraph.select('./text()').extract()
                for paragraph in scope.select(ingredient_nodes_path)
            ])

            il.add_value('datePublished',
                         scope.select(published_path).extract())

            recipes.append(il.load_item())

        return recipes
    def parse_item(self, response):
        """Load one recipe item per schema.org Recipe ``div`` in the response.

        Args:
            response: the response for a recipe page.

        Returns:
            A list of loaded items, one per
            ``div[@itemtype="http://schema.org/Recipe"]`` element (empty when
            the page has none).
        """
        # we use this to run XPath commands against the HTML in the response
        hxs = HtmlXPathSelector(response)

        # this is the base XPath string for the element that contains the
        # recipe info
        base_path = """//div[@itemtype="http://schema.org/Recipe"]"""

        # select() returns a list of selector objects, one per match
        recipes_scopes = hxs.select(base_path)

        # it's easier to define these XPath strings outside of the loop below.
        # NOTE(review): they all start with '//', so they are evaluated
        # against the whole document rather than relative to the recipe scope
        # -- confirm that this is intended.
        name_path = '//div[@itemprop="name"]/text() | //*[@itemprop="name"]//*[@class="fn"]/text()'
        description_path = '//div[@itemprop="description"]/text()'
        image_path = '//img[1]/@src'
        prepTime_path = '//time[@itemprop="prepTime"][contains(@datetime, "PT")]/@datetime | //time[@itemprop="prepTime"]//*[@class="value-title"]/@title'
        cookTime_path = '//time[@itemprop="cookTime"][contains(@datetime, "PT")]/@datetime | //time[@itemprop="cookTime"]//*[@class="value-title"]/@title'
        recipeYield_path = '//span[@itemprop="recipeYield"]/text()'
        ingredients_path = '//li[@itemprop="ingredients"]/text()'
        datePublished = '//abbr[@class="published"]/text()'

        # init an empty list
        recipes = []

        # loop through our recipe scopes and extract the recipe data from each
        for r_scope in recipes_scopes:
            # make a loader around an empty RecipeItem
            il = RecipeItemLoader(item=RecipeItem())

            il.add_value('source', self.source)

            il.add_value('name', r_scope.select(name_path).extract())

            # There's a bunch of images for each recipe; the one used is the
            # second match (index 1). Guard the index so that a page with
            # fewer than two matches yields a recipe without an image instead
            # of raising IndexError and losing the whole page.
            # NOTE(review): the original comment said "the first" while the
            # code has always taken index 1 -- confirm which one is wanted.
            images = r_scope.select(image_path).extract()
            if len(images) > 1:
                il.add_value('image', images[1])
            il.add_value('url', response.url)
            il.add_value('description',
                         r_scope.select(description_path).extract())
            il.add_value('prepTime', r_scope.select(prepTime_path).extract())
            il.add_value('cookTime', r_scope.select(cookTime_path).extract())
            il.add_value('recipeYield',
                         r_scope.select(recipeYield_path).extract())
            il.add_value('ingredients',
                         r_scope.select(ingredients_path).extract())
            il.add_value('datePublished',
                         r_scope.select(datePublished).extract())

            # stick this RecipeItem in the array of recipes we will return
            recipes.append(il.load_item())

        # more processing is done by the openrecipes.pipelines. Look at that
        # file to see transforms that are applied to each RecipeItem
        return recipes
    def parse_item(self, response):
        """Load one recipe item per element whose class is "recipe hrecipe".

        Ingredient lines that start with "For " or end with ':' are skipped.

        Returns:
            A list of loaded items, one per matching element (empty when the
            page has none).
        """
        hxs = HtmlXPathSelector(response)
        scopes = hxs.select('//*[@class="recipe hrecipe"]')

        # No description is extracted for this site (the original author
        # left it as a TODO).
        # NOTE(review): paths starting with '//' are evaluated against the
        # whole document, only the './/' ones are relative to the scope.
        fields_before_url = (
            ('name', './/*[@class="fn"]/text()'),
            ('image', '//div/p[1]//img/@src'),
        )
        fields_after_url = (
            ('prepTime', '//*[@class="preptime"]/text()'),
            ('cookTime', '//*[@class="cooktime"]/text()'),
            ('recipeYield', '//*[@class="yield"]/text()'),
        )
        ingredient_text_path = './/div[@class="ingredient"]/p/text()'
        # selects every text node of the postmeta element, not just the date
        published_path = '//*[@class="postmeta"]/text()'

        label_regex = re.compile(r'^For ')

        recipes = []
        for scope in scopes:
            il = RecipeItemLoader(item=RecipeItem())
            il.add_value('source', self.source)

            for field, path in fields_before_url:
                il.add_value(field, scope.select(path).extract())
            il.add_value('url', response.url)
            for field, path in fields_after_url:
                il.add_value(field, scope.select(path).extract())

            ingredients = []
            for text_node in scope.select(ingredient_text_path):
                line = text_node.extract().strip()
                if label_regex.match(line) or line.endswith(':'):
                    # "For ..." / "...:" lines are left out
                    continue
                ingredients.append(line)
            il.add_value('ingredients', ingredients)

            il.add_value('datePublished',
                         scope.select(published_path).extract())

            recipes.append(il.load_item())

        return recipes
示例#11
0
    def parse_item(self, response):
        """Load one recipe item per ``div`` with id "content".

        Ingredient lines that start with "For " or end with ':' are skipped.

        Returns:
            A list of loaded items, one per matching element (empty when the
            page has none).
        """
        hxs = HtmlXPathSelector(response)
        scopes = hxs.select('//div[@id="content"]')

        fields_before_url = (
            ('name', './/span[@class="item"]/h2[@class="fn"]/text()'),
            ('image', "descendant-or-self::img[@class and contains(concat(' ', normalize-space(@class), ' '), ' size-full ')][1]/@src"),
        )
        fields_after_url = (
            ('prepTime', './/span[@class="preptime"]/text()'),
            ('cookTime', './/span[@class="cooktime"]/text()'),
            ('recipeYield', './/span[@class="yield"]/text()'),
        )
        ingredient_text_path = './/div[@class="ingredient"]/p/text()'

        label_regex = re.compile(r'^For ')

        def is_ingredient(line):
            # keep everything except "For ..." lines and lines ending in ':'
            return not (label_regex.match(line) or line.endswith(':'))

        recipes = []
        for scope in scopes:
            il = RecipeItemLoader(item=RecipeItem())
            il.add_value('source', self.source)

            for field, path in fields_before_url:
                il.add_value(field, scope.select(path).extract())
            il.add_value('url', response.url)
            for field, path in fields_after_url:
                il.add_value(field, scope.select(path).extract())

            stripped_lines = [node.extract().strip()
                              for node in scope.select(ingredient_text_path)]
            il.add_value('ingredients',
                         [line for line in stripped_lines
                          if is_ingredient(line)])

            recipes.append(il.load_item())

        return recipes
示例#12
0
    def parse_item(self, response):
        """Parse recipes from the page, preferring ``parse_recipes()`` output.

        Returns:
            A list of RecipeItem objects when ``parse_recipes()`` finds
            anything; otherwise a single item loaded from fallback XPaths
            (not wrapped in a list).
        """
        hxs = HtmlXPathSelector(response)
        # extracted @data-lazy-src values of the first 'wp-image' img
        images = hxs.select("descendant-or-self::img[@class and contains(@class, 'wp-image')][1]/@data-lazy-src").extract()

        seed = {'source': self.source, 'url': response.url}
        raw_recipes = parse_recipes(hxs, seed)
        if raw_recipes:
            # parse_recipes() found recipes (schema.org markup, per the
            # original author's note): attach the image to each of them
            for raw in raw_recipes:
                raw['image'] = images
            items = []
            for raw in raw_recipes:
                items.append(RecipeItem.from_dict(raw))
            return items

        # nothing found by parse_recipes(): load the item by hand
        il = RecipeItemLoader(item=RecipeItem())
        il.add_value('source', self.source)
        il.add_value('url', response.url)
        il.add_value('image', images)
        il.add_value(
            'name', hxs.select('//*[@class="post-title"]/h1/text()').extract())

        # ingredients may live in paragraphs ...
        paragraphs = hxs.select('//div[@id="recipe" or @class="span9"]/p')
        for paragraph in paragraphs:
            if is_ingredient_container(paragraph):
                il.add_value('ingredients',
                             paragraph.select('text()').extract())
        # ... or in list items ...
        list_items = hxs.select('//*[@class="span9"]//ul/li')
        for list_item in list_items:
            if is_ingredient_container(list_item):
                il.add_value('ingredients',
                             list_item.select('text()').extract())
        # ... or in list items explicitly marked as ingredients
        marked_texts = hxs.select('//li[@class="ingredient"]/text()')
        for text_node in marked_texts:
            il.add_value('ingredients', text_node.extract())

        return il.load_item()
    def parse_item(self, response):
        """Load one recipe item per data-vocabulary.org Recipe ``div``.

        Returns:
            A list of loaded items, one per
            ``div[@itemtype="http://data-vocabulary.org/Recipe"]`` element
            (empty when the page has none).
        """
        hxs = HtmlXPathSelector(response)
        scopes = hxs.select(
            '//div[@itemtype="http://data-vocabulary.org/Recipe"]')

        # (field, XPath) pairs in the order they are added. The og: meta
        # paths start at the document root; the '*//' ones are relative to
        # the recipe scope.
        head_fields = (
            ('name', '//meta[@property="og:title"]/@content'),
            ('image', '//meta[@property="og:image"][1]/@content'),
            ('url', '//meta[@property="og:url"]/@content'),
            ('description', '//meta[@property="og:description"]/@content'),
            ('prepTime', '*//*[@itemprop="prepTime"]/@datetime'),
            ('cookTime', '*//*[@itemprop="cookTime"]/@datetime'),
            ('recipeYield', '*//*[@itemprop="yield"]/text()'),
        )
        ingredient_nodes_path = '*//*[@itemprop="ingredient"]'
        published_path = '*/*[@itemprop="published"]/@datetime'

        def joined_text(node, path):
            # all text matched by `path` under `node`, concatenated and
            # stripped
            return "".join(node.select(path).extract()).strip()

        recipes = []
        for scope in scopes:
            il = RecipeItemLoader(item=RecipeItem())
            il.add_value('source', self.source)

            for field, path in head_fields:
                il.add_value(field, scope.select(path).extract())

            # each ingredient is rendered as "<amount> <name>"
            ingredients = []
            for node in scope.select(ingredient_nodes_path):
                amount = joined_text(node, '*[@itemprop="amount"]/text()')
                name = joined_text(node, '*[@itemprop="name"]/text()')
                ingredients.append("%s %s" % (amount, name))
            il.add_value('ingredients', ingredients)

            il.add_value('datePublished',
                         scope.select(published_path).extract())

            # load_item() returns the item populated by the loader
            recipes.append(il.load_item())

        return recipes
    def parse_item(self, response):
        """Load one recipe item per element whose class is "recipe hrecipe".

        Ingredient lines that start with "For " or end with ':' are left out.

        Returns:
            A list of loaded items, one per matching element (empty when the
            page has none).
        """
        selector = HtmlXPathSelector(response)
        recipe_nodes = selector.select('//*[@class="recipe hrecipe"]')

        # The description is not extracted for this site (still a TODO in the
        # original code).
        # NOTE(review): only the './/' paths are relative to the recipe
        # node; the '//' ones are evaluated against the whole document.
        name_xpath = './/*[@class="fn"]/text()'
        image_xpath = '//div/p[1]//img/@src'
        prep_xpath = '//*[@class="preptime"]/text()'
        cook_xpath = '//*[@class="cooktime"]/text()'
        yield_xpath = '//*[@class="yield"]/text()'
        ingredient_xpath = './/div[@class="ingredient"]/p/text()'
        # every text node of the postmeta element, not only the date
        published_xpath = '//*[@class="postmeta"]/text()'

        label_regex = re.compile(r'^For ')

        recipes = []
        for node in recipe_nodes:

            def values(xpath):
                # extracted matches of `xpath`, selected from this node
                return node.select(xpath).extract()

            il = RecipeItemLoader(item=RecipeItem())
            il.add_value('source', self.source)
            il.add_value('name', values(name_xpath))
            il.add_value('image', values(image_xpath))
            il.add_value('url', response.url)
            il.add_value('prepTime', values(prep_xpath))
            il.add_value('cookTime', values(cook_xpath))
            il.add_value('recipeYield', values(yield_xpath))

            candidate_lines = (text.strip()
                               for text in values(ingredient_xpath))
            il.add_value('ingredients', [
                line for line in candidate_lines
                if not label_regex.match(line) and not line.endswith(':')
            ])

            il.add_value('datePublished', values(published_xpath))

            recipes.append(il.load_item())

        return recipes
示例#15
0
    def parse_item(self, response):
        """Load one recipe item per element whose class is "hrecipe".

        Pages without that markup yield no items.

        Returns:
            A list of loaded items, one per matching element (empty when the
            page has none).
        """
        hxs = HtmlXPathSelector(response)
        scopes = hxs.select('//*[@class="hrecipe"]')

        # (field, XPath) pairs in the order they are added; image and url
        # come from the page's og: meta tags
        head_fields = (
            ('name', './/*[@class="fn"]/text()'),
            ('image', '//meta[@property="og:image"][1]/@content'),
            ('url', '//meta[@property="og:url"]/@content'),
            ('recipeYield', './/*[@class="yield"]/text()'),
        )
        ingredient_nodes_path = '*//*[@class="ingredient"]'
        # the date is taken from the rest of the page, not from under the
        # hrecipe element
        published_path = '//*[@class="date"][1]'

        recipes = []
        for scope in scopes:
            il = RecipeItemLoader(item=RecipeItem())
            il.add_value('source', self.source)

            for field, path in head_fields:
                il.add_value(field, scope.select(path).extract())

            # each ingredient is rendered as "<amount> <name>"
            ingredients = []
            for node in scope.select(ingredient_nodes_path):
                amount_parts = node.select('*[@class="amount"]/text()').extract()
                name_parts = node.select('*[@class="name"]/text()').extract()
                ingredients.append("%s %s" % ("".join(amount_parts).strip(),
                                              "".join(name_parts).strip()))
            il.add_value('ingredients', ingredients)

            il.add_value('datePublished',
                         scope.select(published_path).extract())

            recipes.append(il.load_item())

        return recipes
示例#16
0
    def parse_item_alt1(self, response):
        """Load one recipe item per ``blockquote`` in the response.

        Yield, prep time and cook time are pulled out of the text node that
        starts with "Yields" using regular expressions; each is only added
        when its pattern matches.

        Returns:
            A list of loaded items, one per ``blockquote`` (empty when the
            page has none).
        """
        hxs = HtmlXPathSelector(response)
        scopes = hxs.select('//blockquote')

        # (field, XPath) pairs added before the yield/prep/cook handling;
        # the image path takes images from the first paragraph of the post
        head_fields = (
            ('name', '//meta[@property="og:title"]/@content'),
            ('image', '//div[@class="post"]/p[1]/img/@src'),
            ('url', '//meta[@property="og:url"]/@content'),
            ('description', '//meta[@property="og:description"]/@content'),
        )
        ypc_path = './/p/text()[starts-with(.,"Yields")]'
        # (field, XPath) pairs added afterwards; the ingredients are the
        # paragraph right after the "Yields" one
        tail_fields = (
            ('ingredients',
             './/p[starts-with(text(),"Yields")]/following-sibling::p[1]'),
            ('datePublished',
             '//meta[@property="article:published_time"]/@content'),
            ('dateModified',
             '//meta[@property="article:modified_time"]/@content'),
        )

        recipes = []
        for scope in scopes:
            il = RecipeItemLoader(item=RecipeItem())
            il.add_value('source', self.source)

            for field, path in head_fields:
                il.add_value(field, scope.select(path).extract())

            ypc_str = "".join(scope.select(ypc_path).extract())
            ypc_matches = (
                ('recipeYield',
                 re.match(r'Yields?:?\s([^|]+)', ypc_str, re.I)),
                ('prepTime',
                 re.match(r'.+Prep(?: Time)?:?\s([^|]+)', ypc_str, re.I)),
                ('cookTime',
                 re.match(r'.+Cook Time:?\s([^|]+)', ypc_str, re.I)),
            )
            for field, match in ypc_matches:
                if match:
                    il.add_value(field, match.group(1))

            for field, path in tail_fields:
                il.add_value(field, scope.select(path).extract())

            recipes.append(il.load_item())

        # further processing is done by the openrecipes.pipelines
        return recipes
示例#17
0
    def parse_item(self, response):
        """
        Load one recipe item per ``blockquote`` with class "recipe hrecipe".

        The site's formatting is inconsistent, so when no such element is
        found the work is handed over to ``parse_item_alt1``; no further
        fallbacks are attempted.

        Returns a list of loaded items, or whatever ``parse_item_alt1``
        returns when the fallback is used.
        """
        hxs = HtmlXPathSelector(response)
        scopes = hxs.select('//blockquote[@class="recipe hrecipe"]')

        if len(scopes) == 0:
            # nothing matched: try the alternate parser
            self.log('calling alternate delishhh.com scraper')
            return self.parse_item_alt1(response)

        # (field, XPath) pairs in the order they are added. The meta paths
        # start at the document root; the './/' ones are relative to the
        # recipe scope.
        field_paths = (
            ('name', '//meta[@property="og:title"]/@content'),
            ('image', '//meta[@property="og:image"][1]/@content'),
            ('url', '//meta[@property="og:url"]/@content'),
            ('description', '//meta[@property="og:description"]/@content'),
            ('prepTime', './/*[@class="preptime"]/text()'),
            ('cookTime', './/*[@class="cooktime"]/text()'),
            ('recipeYield', './/*[@class="yield"]/text()'),
            ('ingredients', './/div[@class="ingredient"]/*'),
            ('datePublished',
             '//meta[@property="article:published_time"]/@content'),
            ('dateModified',
             '//meta[@property="article:modified_time"]/@content'),
        )

        recipes = []
        for scope in scopes:
            il = RecipeItemLoader(item=RecipeItem())
            il.add_value('source', self.source)
            for field, path in field_paths:
                il.add_value(field, scope.select(path).extract())
            recipes.append(il.load_item())

        # further processing is done by the openrecipes.pipelines
        return recipes
示例#18
0
    def parse_item(self, response):
        """Load one recipe item per ``span`` with class "hrecipe".

        Args:
            response: the response for a recipe page.

        Returns:
            A list of loaded items, one per matching element (empty when the
            page has none). A recipe whose page has no main image is still
            returned, just without an 'image' value.
        """
        # we use this to run XPath commands against the HTML in the response
        hxs = HtmlXPathSelector(response)

        # this is the base XPath string for the element that contains the
        # recipe info
        base_path = """//span[@class="hrecipe"]"""

        # select() returns a list of selector objects, one per match
        recipes_scopes = hxs.select(base_path)

        # it's easier to define these XPath strings outside of the loop below.
        # NOTE(review): they all start with '//', so they are evaluated
        # against the whole document rather than relative to the recipe scope
        # -- confirm that this is intended.
        name_path = '//div[@class="content"]/header/h1[@class="fn"]/text()'
        description_path = '//article[@class="recipe_description"]//text()'
        image_path = '//div[@class="recipe_image_main"]/p/img/@src'
        recipeYield_path = '//div[@class="recipe_meta"]/p/span[contains(@class,"yield")]/text()'
        ingredients_path = '//article[@class="ingredients"]//ul//li/p[@class="ingredient"]/span[@class="value"]/text()'

        # init an empty list
        recipes = []

        # loop through our recipe scopes and extract the recipe data from each
        for r_scope in recipes_scopes:
            # make a loader around an empty RecipeItem
            il = RecipeItemLoader(item=RecipeItem())

            il.add_value('source', self.source)

            il.add_value('name', r_scope.select(name_path).extract())

            # the first image src is resolved against the page URL. Only do
            # so when one was found: indexing an empty result would raise
            # IndexError and drop the whole recipe.
            image_srcs = r_scope.select(image_path).extract()
            if image_srcs:
                il.add_value('image', urljoin(response.url, image_srcs[0]))
            il.add_value('url', response.url)
            il.add_value('description', r_scope.select(description_path).extract())

            # prepTime not available
            il.add_value('prepTime', None)
            # cookTime not available
            il.add_value('cookTime', None)
            il.add_value('recipeYield', r_scope.select(recipeYield_path).extract())
            il.add_value('ingredients', r_scope.select(ingredients_path).extract())

            # datePublished not available
            il.add_value('datePublished', None)

            # stick this RecipeItem in the array of recipes we will return
            recipes.append(il.load_item())

        # more processing is done by the openrecipes.pipelines. Look at that
        # file to see transforms that are applied to each RecipeItem
        return recipes
示例#19
0
    def parse_item(self, response):
        """Load one recipe item per schema.org Recipe element in the response.

        The 'source' value is the literal 'allrecipes'. Prep and cook times
        are passed through ``parse_iso_date()`` before being added.

        Returns:
            A list of loaded items, one per element whose itemtype is
            "http://schema.org/Recipe" (empty when the page has none).
        """
        hxs = HtmlXPathSelector(response)
        scopes = hxs.select('//*[@itemtype="http://schema.org/Recipe"]')

        # NOTE(review): apart from the yield path these all start with '//',
        # so they are evaluated against the whole document rather than
        # relative to the recipe scope -- confirm that this is intended.
        head_fields = (
            ('name', '//*[@itemprop="name"]/text()'),
            ('image', '//*[@itemprop="image"]/@src'),
            ('url', '//meta[@property="og:url"]/@content'),
            ('description', '//*[@itemprop="description"]/text()'),
        )
        time_fields = (
            ('prepTime', '//*[@itemprop="prepTime"]'),
            ('cookTime', '//*[@itemprop="cookTime"]'),
        )
        yield_path = '*//*[@itemprop="recipeYield"]/text()'
        ingredient_nodes_path = '//*[@itemprop="ingredients"]'

        recipes = []
        for scope in scopes:
            il = RecipeItemLoader(item=RecipeItem())
            il.add_value('source', 'allrecipes')

            for field, path in head_fields:
                il.add_value(field, scope.select(path).extract())

            # the selected nodes themselves (not extracted text) go to
            # parse_iso_date()
            for field, path in time_fields:
                il.add_value(field, parse_iso_date(scope.select(path)))

            il.add_value('recipeYield', scope.select(yield_path).extract())

            # an ingredient is the space-joined text of its child nodes
            il.add_value('ingredients', [
                ' '.join(node.select('node()/text()').extract())
                for node in scope.select(ingredient_nodes_path)
            ])

            recipes.append(il.load_item())

        return recipes
示例#20
0
    def parse_item(self, response):
        """Extract a RecipeItem from every hrecipe blockquote in the response.

        Returns a list of loaded items (empty when no blockquote matches).
        """
        selector = HtmlXPathSelector(response)

        # the blockquote carries several classes, so "hrecipe" is matched as
        # one whitespace-delimited token rather than as the whole attribute
        scopes = selector.select(
            "//blockquote[contains(concat(' ', normalize-space(@class), ' '), ' hrecipe ')]")

        name_xpath = '//*[@class="fn"]/text()'
        # fields loaded after 'url', in the order they reach the loader
        remaining_fields = (
            ('image', '//img[@class="photo"]/@src'),
            ('description', '//*[@class="summary"]/p/text()'),
            ('prepTime', '//*[@class="preptime"]/text()'),
            ('cookTime', '//*[@class="cooktime"]/text()'),
            ('totalTime', '//*[@class="duration"]/text()'),
            ('recipeYield', '//*[@class="yield"]/text()'),
            ('ingredients',
             '//*[@class="ingredient"]/p/text() | //*[@class="ingredient"]/span/text()'),
        )

        items = []
        for scope in scopes:
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)
            loader.add_value('name', scope.select(name_xpath).extract())
            loader.add_value('url', response.url)
            for field, xpath in remaining_fields:
                loader.add_value(field, scope.select(xpath).extract())
            items.append(loader.load_item())

        return items
示例#21
0
    def parse_item(self, response):
        """Extract one RecipeItem per hrecipe element found in the response.

        Returns a list of loaded RecipeItem objects; the list is empty when
        no element carrying the ``hrecipe`` class is present.
        """

        hxs = HtmlXPathSelector(response)

        # match "hrecipe" as one whitespace-delimited class token so that
        # elements carrying additional classes are still selected
        base_path = """//*[contains(concat(' ', normalize-space(@class), ' '),
                        ' hrecipe ')]"""
        recipes_scopes = hxs.select(base_path)

        name_path = '//*[@class="fn"]/text()'
        description_path = '//*[@class="recipe-description summary"]/p/text()'
        image_path = '//img[@class="photo"]/@src'
        recipeYield_path = '//*[@class="directions"]/p/text()'
        ingredients_path = '//*[@class="ingredient"]/text()'

        # the yield only appears inside the directions text as 'Serves n.';
        # \d+ also accepts yields of ten or more, which a single \d misses
        recipeYield_re = r'Serves \d+\.'

        recipes = []

        for r_scope in recipes_scopes:
            il = RecipeItemLoader(item=RecipeItem())

            il.add_value('source', self.source)

            il.add_value('name', r_scope.select(name_path).extract())
            il.add_value('image', r_scope.select(image_path).extract())
            il.add_value('url', response.url)
            il.add_value('description',
                         r_scope.select(description_path).extract())

            il.add_value('recipeYield',
                         r_scope.select(recipeYield_path).re(recipeYield_re))

            il.add_value('ingredients',
                         r_scope.select(ingredients_path).extract())

            recipes.append(il.load_item())

        return recipes
示例#22
0
    def parse_item(self, response):
        """Load a RecipeItem from each recipe content container in the response.

        Returns a list of loaded items; it is empty when the container XPath
        matches nothing.
        """
        selector = HtmlXPathSelector(response)

        # element that wraps the recipe information
        containers = selector.select(
            '//*[@id="container"]/*[@class="onepage"]/div[1]/div[@class="content"]')

        title_xpath = '//h1[@class="title"]/text() | //*[@class="content"]/p[@style="text-align: center;"]/following-sibling::p[strong]/strong/text()'
        photo_xpath = '//*[@class="content"]/p[1]/img[contains(@class, "size-full")]/@src'
        servings_xpath = '//*[@class="content"]/p[@style="text-align: center;"]/following-sibling::p[em and strong]/em/text()'
        published_xpath = '//*[@class="phn-date"]/a[@rel="author"]/following-sibling::text()'

        # Ingredients and garnishes are <br>-separated lines inside <p> tags.
        # Paragraphs with a <strong>, <a> or <img> child are excluded, which
        # skips the "EVENT VENUE PARTY SIZE TYPE MENU" paragraph.
        ingredient_xpath = '//*[@class="content"]/p[not(strong or a or img) and br]/text()'

        items = []
        for scope in containers:
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)
            loader.add_value('name', scope.select(title_xpath).extract())
            loader.add_value('image', scope.select(photo_xpath).extract())
            loader.add_value('url', response.url)
            loader.add_value('recipeYield', scope.select(servings_xpath).extract())

            # keep only the first date text node; remove its first 'on' and
            # every '|' separator, then trim surrounding whitespace
            date_nodes = scope.select(published_xpath).extract()
            if date_nodes:
                raw_date = date_nodes[0]
                without_prefix = raw_date.replace('on', '', 1)
                loader.add_value('datePublished',
                                 without_prefix.replace('|', '').strip())

            loader.add_value('ingredients', scope.select(ingredient_xpath).extract())

            items.append(loader.load_item())

        # the loaded items are returned as-is; any later transformation
        # happens outside this method
        return items
    def parse_item(self, response):
        """Build a RecipeItem for each data-vocabulary.org Recipe div.

        Returns a list of loaded items (empty when no such div exists).
        """
        selector = HtmlXPathSelector(response)
        scopes = selector.select(
            '//div[@itemtype="http://data-vocabulary.org/Recipe"]')

        # fields whose extracted matches are given to the loader unchanged,
        # listed in the order they are loaded
        plain_fields = (
            ('name', '//meta[@property="og:title"]/@content'),
            ('image', '//meta[@property="og:image"][1]/@content'),
            ('url', '//meta[@property="og:url"]/@content'),
            ('description', '//meta[@property="og:description"]/@content'),
            ('prepTime', '*//*[@itemprop="prepTime"]/@datetime'),
            ('cookTime', '*//*[@itemprop="cookTime"]/@datetime'),
            ('recipeYield', '*//*[@itemprop="yield"]/text()'),
        )
        ingredient_xpath = '*//*[@itemprop="ingredient"]'
        published_xpath = '*/*[@itemprop="published"]/@datetime'

        def joined_text(node, xpath):
            # concatenate all matching text nodes and trim the result
            return "".join(node.select(xpath).extract()).strip()

        items = []
        for scope in scopes:
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)

            for field, xpath in plain_fields:
                loader.add_value(field, scope.select(xpath).extract())

            # each ingredient element holds an "amount" and a "name" child;
            # they are combined into a single "<amount> <name>" string
            loader.add_value('ingredients', [
                "%s %s" % (joined_text(node, '*[@itemprop="amount"]/text()'),
                           joined_text(node, '*[@itemprop="name"]/text()'))
                for node in scope.select(ingredient_xpath)
            ])

            loader.add_value('datePublished',
                             scope.select(published_xpath).extract())

            # load_item() returns the RecipeItem populated by the loader
            items.append(loader.load_item())

        return items
示例#24
0
    def parse_item(self, response):
        """Extract one RecipeItem per ``recipe-details`` div in the response.

        Returns a list of loaded RecipeItem objects; the list is empty when
        the page contains no ``recipe-details`` div.
        """

        # we use this to run XPath commands against the HTML in the response
        hxs = HtmlXPathSelector(response)

        # base XPath for the element that contains the recipe info
        base_path = """//div[@class="recipe-details"]"""

        # one selector per matching element; possibly none at all
        recipes_scopes = hxs.select(base_path)

        # it's easier to define these XPath strings outside of the loop below
        name_path = '//h1[@itemprop="name"]/text()'
        recipeYield_path = '//label[@for="set_servings"]/input/@value'
        description_path = '//span[@itemprop="summary"]/p/text()'
        image_path = '//img[@class="the_recipe_image"]/@src'
        cookTime_path = '//form/p/time[@itemprop="cookTime"]/@datetime'
        prepTime_path = '//form/p/time[@itemprop="prepTime"]/@datetime'
        ingredients_path = '//span[@itemprop="ingredient"]'
        # these two are relative to a single ingredient element
        ingredients_amounts_path = './span[@itemprop="amount"]/text()'
        ingredients_names_path = './span[@itemprop="name"]/text()'
        datePublished_path = '//span[@itemprop="published"]/@datetime'

        recipes = []

        # loop through our recipe scopes and extract the recipe data from each
        for r_scope in recipes_scopes:
            il = RecipeItemLoader(item=RecipeItem())

            il.add_value('source', self.source)

            il.add_value('name', r_scope.select(name_path).extract())
            il.add_value('image', r_scope.select(image_path).extract())
            il.add_value('description',
                         r_scope.select(description_path).extract())
            il.add_value('url', response.url)
            il.add_value('prepTime', r_scope.select(prepTime_path).extract())
            il.add_value('cookTime', r_scope.select(cookTime_path).extract())
            il.add_value('recipeYield',
                         r_scope.select(recipeYield_path).extract())
            il.add_value('datePublished',
                         r_scope.select(datePublished_path).extract())

            # Pair amount and name inside each ingredient element. Extracting
            # all amounts and all names as two flat lists and zipping them
            # mis-pairs every following ingredient as soon as one of them has
            # no amount (or no name), and drops the unmatched tail.
            ingredients = []
            for i_scope in r_scope.select(ingredients_path):
                amount = "".join(
                    i_scope.select(ingredients_amounts_path).extract())
                name = "".join(
                    i_scope.select(ingredients_names_path).extract())
                parts = [part for part in (amount, name) if part]
                if parts:
                    ingredients.append(" ".join(parts).encode('utf-8'))

            il.add_value('ingredients', ingredients)

            # stick this RecipeItem in the list of recipes we will return
            recipes.append(il.load_item())

        return recipes
示例#25
0
    def parse_item(self, response):
        """Build a RecipeItem for every ``post`` div in the response.

        Ingredient lines are taken only from entry paragraphs that
        ingredient_heuristic scores above RECIPE_THRESHOLD. Returns a list of
        loaded items.
        """
        selector = HtmlXPathSelector(response)

        title_xpath = 'h2/a[@rel="bookmark"]/text()'
        photo_xpath = '(//div[@class="entry"]/p/a[@title]/img/@src)[1]'
        summary_xpath = 'div[@class="entry"]/text()'
        paragraph_xpath = 'div[@class="entry"]/p'
        published_xpath = 'div[@class="date"]/text()'

        items = []
        for post in selector.select('//div[@class="post"]'):
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)
            loader.add_value('name', post.select(title_xpath).extract())
            loader.add_value('image', post.select(photo_xpath).extract())
            loader.add_value('url', response.url)

            # the description is the entry's own text nodes glued together
            summary = ''.join(post.select(summary_xpath).extract())
            loader.add_value('description', summary.strip())

            ingredients = []
            for paragraph in post.select(paragraph_xpath):
                # skip paragraphs the heuristic does not rate as ingredients
                if not (ingredient_heuristic(paragraph) > RECIPE_THRESHOLD):
                    continue
                ingredients.extend(
                    text_node.extract().strip()
                    for text_node in paragraph.select('text()'))
            loader.add_value('ingredients', ingredients)

            loader.add_value('datePublished',
                             post.select(published_xpath).extract())

            items.append(loader.load_item())

        return items
示例#26
0
    def parse_item(self, response):
        """Extract one RecipeItem per ``primary_content`` div in the response.

        Prep and cook times are parsed out of free text with regular
        expressions. Returns a list of loaded RecipeItem objects (empty when
        the div is absent).
        """

        hxs = HtmlXPathSelector(response)

        base_path = """//div[@id="primary_content"]"""

        recipes_scopes = hxs.select(base_path)

        name_path = '//h1[@class="fn"]/text()'
        description_path = '//meta[@property="og:description"]/@content'
        url_path = '//meta[@property="og:url"]/@content'
        image_path = '//meta[@property="og:image"][1]/@content'
        time_path = './/p[@class="summary_data"][contains(text(), "Prep Time")]/text()'
        recipeYield_path = '//span[@class="yield"]/text()'
        ingredients_path = '*//*[@class="ingredient"]'
        datePublished_path = '//p[@id="mag_info"]/text()'

        # time isn't stored in semantic markup on this site, so it is parsed
        # from text. The patterns are compiled once, outside the loop, and
        # written as raw strings so the regex escapes stay literal.
        prep_time_re = re.compile(
            r"\s?Prep Time:\s?(\d{1,}\s(?:second|minute|hour|day)s?)", re.I)
        cook_time_re = re.compile(
            r".+\s?Cook Time:\s?(\d{1,}\s(?:second|minute|hour|day)s?)", re.I)

        recipes = []

        for r_scope in recipes_scopes:

            il = RecipeItemLoader(item=RecipeItem())

            il.add_value("source", self.source)

            il.add_value("name", r_scope.select(name_path).extract())
            il.add_value("image", r_scope.select(image_path).extract())
            il.add_value("url", r_scope.select(url_path).extract())
            il.add_value("description", r_scope.select(description_path).extract())

            time_str = "".join(r_scope.select(time_path).extract())
            if time_str.strip():
                # both patterns are anchored at the start of time_str (match)
                prep_match = prep_time_re.match(time_str)
                if prep_match:
                    il.add_value("prepTime", prep_match.group(1))

                cook_match = cook_time_re.match(time_str)
                if cook_match:
                    il.add_value("cookTime", cook_match.group(1))

            il.add_value("recipeYield", r_scope.select(recipeYield_path).extract())

            # the ingredients are pretty well formatted here; join each
            # element's text nodes and trim the surrounding whitespace
            ingredients = [
                "".join(i_scope.select("text()").extract()).strip()
                for i_scope in r_scope.select(ingredients_path)
            ]
            il.add_value("ingredients", ingredients)

            # Date Published is formatted as [Category] | MMM YYYY
            # Keep only what follows the first "|" (empty when there is none)
            datePublished = r_scope.select(datePublished_path).extract()
            datePublished = "".join(datePublished).partition("|")[2]
            il.add_value("datePublished", datePublished)

            recipes.append(il.load_item())

        return recipes
    def parse_item(self, response):
        """Return a list with one RecipeItem per hrecipe blockquote found."""
        page = HtmlXPathSelector(response)

        # "hrecipe" is only one of the blockquote's classes, so it is matched
        # as a whitespace-delimited token of the class attribute
        blockquotes = page.select(
            "//blockquote[contains(concat(' ', normalize-space(@class), ' '), ' hrecipe ')]")

        name_xpath = '//*[@class="fn"]/text()'
        # everything loaded after 'url', in loading order
        trailing_fields = [
            ('image', '//img[@class="photo"]/@src'),
            ('description', '//*[@class="summary"]/p/text()'),
            ('prepTime', '//*[@class="preptime"]/text()'),
            ('cookTime', '//*[@class="cooktime"]/text()'),
            ('totalTime', '//*[@class="duration"]/text()'),
            ('recipeYield', '//*[@class="yield"]/text()'),
            ('ingredients',
             '//*[@class="ingredient"]/p/text() | //*[@class="ingredient"]/span/text()'),
        ]

        loaded = []
        for quote in blockquotes:
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)
            loader.add_value('name', quote.select(name_xpath).extract())
            loader.add_value('url', response.url)
            for field_name, field_xpath in trailing_fields:
                loader.add_value(field_name, quote.select(field_xpath).extract())
            loaded.append(loader.load_item())

        return loaded
    def parse_item(self, response):
        """Create a RecipeItem for every element with id "recipe".

        Returns a list of loaded items (empty when nothing matches).
        """
        selector = HtmlXPathSelector(response)

        title_xpath = '//*[@class="row page_title clearfix"]/h2/text()'
        summary_xpath = '//*[@class="entry"]/p//text()'
        photo_xpath = '//*[@class="featured_image"]/img[@class="image"]/@src'
        servings_xpath = '//*[@class="breakdown"]/tbody/tr[1]/td[1]/text()'
        ingredient_rows_xpath = '//*[@class="ingredients"]/tr'

        items = []
        for scope in selector.select('//*[@id="recipe"]'):
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)
            loader.add_value('name', scope.select(title_xpath).extract())
            loader.add_value('image', scope.select(photo_xpath).extract())
            loader.add_value('url', response.url)
            loader.add_value('description', scope.select(summary_xpath).extract())
            loader.add_value('recipeYield', scope.select(servings_xpath).extract())

            # one ingredient per table row: the row's <td> elements are
            # extracted with their markup and joined with spaces
            # NOTE(review): the markup is presumably stripped later by the
            # pipeline - confirm
            loader.add_value('ingredients', [
                " ".join(row.select('td').extract()).strip()
                for row in scope.select(ingredient_rows_xpath)
            ])

            items.append(loader.load_item())

        return items
示例#29
0
    def parse_item(self, response):
        """Build a RecipeItem for each element whose class is exactly "hrecipe".

        Pages without that markup yield an empty list.
        """
        selector = HtmlXPathSelector(response)

        # many recipes on the site lack the semantic markup; only hrecipe
        # elements are handled here
        scopes = selector.select('//*[@class="hrecipe"]')

        title_xpath = './/*[@class="fn"]/text()'
        photo_xpath = '//meta[@property="og:image"][1]/@content'
        link_xpath = '//meta[@property="og:url"]/@content'
        servings_xpath = './/*[@class="yield"]/text()'
        ingredient_xpath = '*//*[@class="ingredient"]'
        # the date sits elsewhere on the page, not under the hrecipe element
        published_xpath = '//*[@class="date"][1]'

        def stripped_text(node, xpath):
            # glue the matching text nodes together and trim the result
            return "".join(node.select(xpath).extract()).strip()

        items = []
        for scope in scopes:
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)
            loader.add_value('name', scope.select(title_xpath).extract())
            loader.add_value('image', scope.select(photo_xpath).extract())
            loader.add_value('url', scope.select(link_xpath).extract())
            loader.add_value('recipeYield', scope.select(servings_xpath).extract())

            # each ingredient becomes "<amount> <name>"
            loader.add_value('ingredients', [
                "%s %s" % (stripped_text(node, '*[@class="amount"]/text()'),
                           stripped_text(node, '*[@class="name"]/text()'))
                for node in scope.select(ingredient_xpath)
            ])

            loader.add_value('datePublished',
                             scope.select(published_xpath).extract())

            items.append(loader.load_item())

        return items
示例#30
0
    def parse_item(self, response):
        """Return one loaded RecipeItem per schema.org Recipe element.

        The result is an empty list when the response has no such element.
        """
        page = HtmlXPathSelector(response)
        recipe_nodes = page.select('//*[@itemtype="http://schema.org/Recipe"]')

        # fields filled straight from the extracted XPath matches
        extracted_fields = [
            ('name', '//*[@itemprop="name"]/text()'),
            ('image', '//*[@itemprop="image"]/@src'),
            ('url', '//meta[@property="og:url"]/@content'),
            ('description', '//*[@itemprop="description"]/text()'),
        ]
        # fields whose selector is converted by parse_iso_date
        timed_fields = [
            ('prepTime', '//*[@itemprop="prepTime"]'),
            ('cookTime', '//*[@itemprop="cookTime"]'),
        ]
        servings_xpath = '*//*[@itemprop="recipeYield"]/text()'
        ingredient_xpath = '//*[@itemprop="ingredients"]'

        loaded = []
        for node in recipe_nodes:
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', 'allrecipes')

            for field_name, field_xpath in extracted_fields:
                loader.add_value(field_name, node.select(field_xpath).extract())

            for field_name, field_xpath in timed_fields:
                loader.add_value(field_name,
                                 parse_iso_date(node.select(field_xpath)))

            loader.add_value('recipeYield', node.select(servings_xpath).extract())

            # join the text of each ingredient's child nodes with spaces
            ingredient_lines = [
                ' '.join(part.select('node()/text()').extract())
                for part in node.select(ingredient_xpath)
            ]
            loader.add_value('ingredients', ingredient_lines)

            loaded.append(loader.load_item())

        return loaded
示例#31
0
    def parse_item(self, response):
        """Extract one RecipeItem per ``primary_content`` div in the response.

        Prep and cook times are parsed out of free text with regular
        expressions. Returns a list of loaded RecipeItem objects (empty when
        the div is absent).
        """

        hxs = HtmlXPathSelector(response)

        base_path = """//div[@id="primary_content"]"""

        recipes_scopes = hxs.select(base_path)

        name_path = '//h1[@class="fn"]/text()'
        description_path = '//meta[@property="og:description"]/@content'
        url_path = '//meta[@property="og:url"]/@content'
        image_path = '//meta[@property="og:image"][1]/@content'
        time_path = './/p[@class="summary_data"][contains(text(), "Prep Time")]/text()'
        recipeYield_path = '//span[@class="yield"]/text()'
        ingredients_path = '*//*[@class="ingredient"]'
        datePublished_path = '//p[@id="mag_info"]/text()'

        # time isn't stored in semantic markup on this site, so it is parsed
        # from text. The patterns are compiled once, outside the loop, and
        # written as raw strings so the regex escapes stay literal.
        prep_time_re = re.compile(
            r'\s?Prep Time:\s?(\d{1,}\s(?:second|minute|hour|day)s?)', re.I)
        cook_time_re = re.compile(
            r'.+\s?Cook Time:\s?(\d{1,}\s(?:second|minute|hour|day)s?)', re.I)

        recipes = []

        for r_scope in recipes_scopes:

            il = RecipeItemLoader(item=RecipeItem())

            il.add_value('source', self.source)

            il.add_value('name', r_scope.select(name_path).extract())
            il.add_value('image', r_scope.select(image_path).extract())
            il.add_value('url', r_scope.select(url_path).extract())
            il.add_value('description',
                         r_scope.select(description_path).extract())

            time_str = "".join(r_scope.select(time_path).extract())
            if time_str.strip():
                # both patterns are anchored at the start of time_str (match)
                prep_match = prep_time_re.match(time_str)
                if prep_match:
                    il.add_value('prepTime', prep_match.group(1))

                cook_match = cook_time_re.match(time_str)
                if cook_match:
                    il.add_value('cookTime', cook_match.group(1))

            il.add_value('recipeYield',
                         r_scope.select(recipeYield_path).extract())

            # the ingredients are pretty well formatted here; join each
            # element's text nodes and trim the surrounding whitespace
            ingredients = [
                "".join(i_scope.select('text()').extract()).strip()
                for i_scope in r_scope.select(ingredients_path)
            ]
            il.add_value('ingredients', ingredients)

            # Date Published is formatted as [Category] | MMM YYYY
            # Keep only what follows the first "|" (empty when there is none)
            datePublished = r_scope.select(datePublished_path).extract()
            datePublished = "".join(datePublished).partition("|")[2]
            il.add_value('datePublished', datePublished)

            recipes.append(il.load_item())

        return recipes
示例#32
0
    def parse_item(self, response):
        """Return a list holding one RecipeItem per element with id "recipe"."""
        page = HtmlXPathSelector(response)
        recipe_nodes = page.select('//*[@id="recipe"]')

        name_xpath = '//*[@class="row page_title clearfix"]/h2/text()'
        image_xpath = '//*[@class="featured_image"]/img[@class="image"]/@src'
        description_xpath = '//*[@class="entry"]/p//text()'
        yield_xpath = '//*[@class="breakdown"]/tbody/tr[1]/td[1]/text()'
        rows_xpath = '//*[@class="ingredients"]/tr'

        loaded = []
        for node in recipe_nodes:
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value('source', self.source)
            loader.add_value('name', node.select(name_xpath).extract())
            loader.add_value('image', node.select(image_xpath).extract())
            loader.add_value('url', response.url)
            loader.add_value('description',
                             node.select(description_xpath).extract())
            loader.add_value('recipeYield', node.select(yield_xpath).extract())

            # every <tr> is one ingredient; its <td> cells are extracted as
            # markup, joined with single spaces and trimmed
            # NOTE(review): tag removal presumably happens in the pipeline -
            # confirm
            ingredient_lines = []
            for table_row in node.select(rows_xpath):
                cells = table_row.select('td').extract()
                ingredient_lines.append(" ".join(cells).strip())
            loader.add_value('ingredients', ingredient_lines)

            loaded.append(loader.load_item())

        return loaded
    def parse_item(self, response):
        """Build a RecipeItem for each element with class "recipe hrecipe".

        Ingredient lines that start with "For " or end with ":" are treated
        as section labels and left out. Returns a list of loaded items.
        """
        selector = HtmlXPathSelector(response)
        scopes = selector.select('//*[@class="recipe hrecipe"]')

        title_xpath = '//*[@class="fn"]/text()'
        photo_xpath = '//p[1]/span/img/@src'
        # fields loaded after 'url', in loading order
        trailing_fields = (
            ('description', '//*[@class="summary"]/p/text()'),
            ('prepTime', '//*[@class="preptime"]/text()'),
            ('cookTime', './/*[@class="cooktime"]/text()'),
            ('recipeYield', '//*[@class="yield"]/text()'),
        )
        ingredient_xpath = './/*[@class="ingredient"]/p/text()'
        # datePublished is not extracted yet (TODO)

        section_label = re.compile(r'^For ')

        items = []
        for scope in scopes:
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)
            loader.add_value('name', scope.select(title_xpath).extract())
            loader.add_value('image', scope.select(photo_xpath).extract())
            loader.add_value('url', response.url)

            for field, xpath in trailing_fields:
                loader.add_value(field, scope.select(xpath).extract())

            ingredients = []
            for text_node in scope.select(ingredient_xpath):
                line = text_node.extract().strip()
                # drop headings such as "For the sauce" or "Dressing:"
                if section_label.match(line) or line.endswith(':'):
                    continue
                ingredients.append(line)
            loader.add_value('ingredients', ingredients)

            items.append(loader.load_item())

        return items
示例#34
0
    def parse_item(self, response):
        """Extract a RecipeItem from each data-vocabulary.org Recipe article.

        Each ingredient string is assembled from the amount and name texts
        found inside the same ``<li itemprop="ingredient">`` node and is
        UTF-8 encoded before being handed to the loader.

        Returns the list of loaded items (empty when nothing matches).
        """
        hxs = HtmlXPathSelector(response)

        base_path = '//article[@itemtype="http://data-vocabulary.org/Recipe"]'

        recipes_scopes = hxs.select(base_path)

        name_path = '//h1[@itemprop="name"]/text()'
        description_path = '//meta[@name="description"]/@content'
        image_path = '//img[@itemprop="photo"]/@src'
        prepTime_path = '//span[@itemprop="prepTime"]/text()'
        cookTime_path = '//span[@itemprop="cookTime"]/text()'
        recipeYield_path = '//span[@itemprop="yield"]/text()'
        ingredients_path = '//li[@itemprop="ingredient"]'
        ingredients_amounts_path = './span[@itemprop="amount"]/span/text()'
        ingredients_names_path = './span[@itemprop="name"]/text()'

        recipes = []

        for r_scope in recipes_scopes:
            il = RecipeItemLoader(item=RecipeItem())

            il.add_value('source', self.source)

            il.add_value('name', r_scope.select(name_path).extract())
            il.add_value('image', r_scope.select(image_path).extract())
            il.add_value('url', response.url)
            il.add_value('description', r_scope.select(description_path).extract())

            il.add_value('prepTime', r_scope.select(prepTime_path).extract())
            il.add_value('cookTime', r_scope.select(cookTime_path).extract())
            il.add_value('recipeYield', r_scope.select(recipeYield_path).extract())

            # Grab the amount and name spans of each ingredient node, then
            # combine them into a string.  Pairing is done per node: zipping
            # two document-wide lists shifts every later pair as soon as one
            # ingredient has no amount (or more than one amount text).
            ingredients = []
            for i_scope in r_scope.select(ingredients_path):
                parts = (i_scope.select(ingredients_amounts_path).extract() +
                         i_scope.select(ingredients_names_path).extract())
                if parts:
                    ingredients.append(" ".join(parts).encode('utf-8'))
            il.add_value('ingredients', ingredients)

            recipes.append(il.load_item())

        return recipes
    def parse_item(self, response):
        """Extract name, image and ingredients from "post hrecipe" elements.

        Returns the list of loaded RecipeItems (empty when nothing matches).
        """
        document = HtmlXPathSelector(response)

        title_xpath = '//*[@class="title fn"]/text()'
        photo_xpath = '//*[@class="photo"]/@src'
        ingredient_xpath = '//ul[@class="ingredient_list"]/li/text()'

        items = []
        for post in document.select('//*[@class="post hrecipe"]'):
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value('source', self.source)

            titles = post.select(title_xpath).extract()
            loader.add_value('name', titles)

            photos = post.select(photo_xpath).extract()
            loader.add_value('image', photos)

            loader.add_value('url', response.url)

            ingredient_lines = post.select(ingredient_xpath).extract()
            loader.add_value('ingredients', ingredient_lines)

            items.append(loader.load_item())

        return items
示例#36
0
    def parse_item(self, response):
        """Extract a RecipeItem from the page body.

        Ingredients are read from ``span[@itemprop="ingredient"]`` nodes
        first; if that produces nothing, the list items under
        ``div#ingredients`` are used instead.  Prep and cook times are
        passed through ``parse_iso_date`` before being loaded.

        Returns the list of loaded items.  Further clean-up is left to the
        openrecipes pipelines.
        """
        # Selector used to run XPath queries against the response HTML.
        document = HtmlXPathSelector(response)

        # The whole <body> serves as the recipe container, so there is
        # normally one scope per page.
        body_scopes = document.select("""//body""")

        # XPath expressions, defined once outside the loop.
        name_xpath = '//h1[@itemprop="name"]/text()'
        yield_xpath = '//span[@itemprop="yield"]/text()'
        description_xpath = '//meta[@name="description"]/@content'
        image_xpath = '//img[@class="recipe_image"]/@src'
        cook_time_xpath = '//time[@itemprop="totalTime"]'
        prep_time_xpath = '//time[@itemprop="activeTime"]'

        # Ingredient markup is inconsistent, so there is a primary
        # expression and a fallback for when the primary finds nothing.
        primary_ingredient_xpath = '//span[@itemprop="ingredient"]'
        fallback_ingredient_xpath = '//div[@id="ingredients"]/ul/li/text()'

        items = []
        for scope in body_scopes:
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value('source', self.source)

            loader.add_value('name', scope.select(name_xpath).extract())
            loader.add_value('image', scope.select(image_xpath).extract())
            loader.add_value('description', scope.select(description_xpath).extract())
            loader.add_value('url', response.url)
            # parse_iso_date receives the selector list, not extracted text.
            loader.add_value('prepTime', parse_iso_date(scope.select(prep_time_xpath)))
            loader.add_value('cookTime', parse_iso_date(scope.select(cook_time_xpath)))
            loader.add_value('recipeYield', scope.select(yield_xpath).extract())

            # Join the stripped text pieces of every ingredient span.
            ingredient_lines = [
                ' '.join(
                    piece.strip()
                    for piece in span.select('node()/text() | text()').extract()
                ).encode('utf-8')
                for span in scope.select(primary_ingredient_xpath)
            ]

            # Nothing from the primary expression: use the fallback.
            if not ingredient_lines:
                ingredient_lines = [
                    text_node.extract().strip().encode('utf-8')
                    for text_node in scope.select(fallback_ingredient_xpath)
                ]

            loader.add_value('ingredients', ingredient_lines)

            items.append(loader.load_item())

        return items
示例#37
0
    def parse_item(self, response):
        """Extract a RecipeItem from each ``blockquote.recipe`` element.

        Name, URL, description and image come from the page's Open Graph
        meta tags; times, yield and ingredients come from itemprop markup
        below the blockquote.  Each ingredient is the stripped extract()
        of its node followed by a single space.

        Returns the list of loaded items (empty when nothing matches).
        """
        document = HtmlXPathSelector(response)
        blockquotes = document.select("""//blockquote[@class="recipe"]""")

        og_title = '//meta[@property="og:title"]/@content'
        og_url = '//meta[@property="og:url"]/@content'
        og_description = '//meta[@property="og:description"]/@content'
        og_image = '//meta[@property="og:image"][1]/@content'
        prep_xpath = '*//*[@itemprop="prepTime"]/@content'
        cook_xpath = '*//*[@itemprop="cookTime"]/@content'
        yield_xpath = '*//*[@itemprop="recipeYield"]/text()'
        ingredient_xpath = '*//*[@itemprop="ingredients"]'
        date_xpath = '//p[@class="date"]/text()'

        def pull(scope, xpath):
            # Run the XPath from ``scope`` and return the extracted strings.
            return scope.select(xpath).extract()

        items = []
        for quote in blockquotes:
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value('source', self.source)

            loader.add_value('name', pull(quote, og_title))
            loader.add_value('image', pull(quote, og_image))
            loader.add_value('url', pull(quote, og_url))
            loader.add_value('description', pull(quote, og_description))
            loader.add_value('prepTime', pull(quote, prep_xpath))
            loader.add_value('cookTime', pull(quote, cook_xpath))
            loader.add_value('recipeYield', pull(quote, yield_xpath))

            ingredient_lines = [
                "%s " % node.extract().strip()
                for node in quote.select(ingredient_xpath)
            ]
            loader.add_value('ingredients', ingredient_lines)

            loader.add_value('datePublished', pull(quote, date_xpath))

            items.append(loader.load_item())

        return items
示例#38
0
    def parse_item(self, response):
        """Extract a RecipeItem from each element with id "recipe".

        Loads name, image, url, description, yield and ingredients.  The
        'source' field is not populated by this method, and prep time,
        cook time and publication date are not extracted.

        Returns the list of loaded items (empty when nothing matches).
        """
        document = HtmlXPathSelector(response)
        containers = document.select('//*[@id="recipe"]')

        title_xpath = '//*[@class="row page_title clearfix"]/h2/text()'
        description_xpath = '//*[@class="entry"]/p//text()'
        image_xpath = '//*[@class="featured_image"]/img[@class="image"]/@src'
        yield_xpath = '//*[@class="breakdown"]/tbody/tr[1]/td[1]/text()'
        ingredient_table_xpath = '*//*[@class="ingredients"]'
        # The site only offers a total time, so prep and cook time are not
        # split out; the page also provides no publication date.

        items = []
        for container in containers:
            loader = RecipeItemLoader(item=RecipeItem())

            # NOTE(review): unlike the other fields, 'source' is never
            # added here - confirm whether that is intentional.

            loader.add_value('name', container.select(title_xpath).extract())
            loader.add_value('image', container.select(image_xpath).extract())
            loader.add_value('url', response.url)
            loader.add_value('description', container.select(description_xpath).extract())
            loader.add_value('recipeYield', container.select(yield_xpath).extract())

            # NOTE(review): both inner expressions start with "//", so they
            # are evaluated against the whole document rather than the
            # current table; every entry therefore concatenates all matches
            # on the page.  Confirm against the site markup before changing.
            ingredient_lines = []
            for table in container.select(ingredient_table_xpath):
                amount_markup = table.select('//td/strong').extract()
                name_texts = table.select(
                    '//*[@class="ingredients"]/tbody/tr/td/text()').extract()
                ingredient_lines.append("%s %s" % (
                    "".join(amount_markup).strip(),
                    "".join(name_texts).strip(),
                ))
            loader.add_value('ingredients', ingredient_lines)

            items.append(loader.load_item())

        return items
示例#39
0
    def parse_item(self, response):
        """Extract a RecipeItem from each ``article.hrecipe`` element.

        Returns the list of loaded items (empty when nothing matches).
        Further transforms are applied later by the openrecipes pipelines.
        """
        # Selector used to run XPath queries against the response HTML.
        document = HtmlXPathSelector(response)

        # One selector per element that wraps a recipe; normally a page
        # carries at most one.
        articles = document.select("""//article[@class="hrecipe"]""")

        # XPath expressions, defined once outside the loop.
        title_xpath = '//h1/text()'
        yield_xpath = '//span[@class="info yield"]/text()'
        image_xpath = '//section[@class="content-unit"]/img/@src'
        prep_xpath = '//span[@class="info preptime"]/text()'
        cook_xpath = '//span[@class="info duration"]/text()'
        ingredient_xpath = '//div[@class="ingredients-section"]/ul/li/span/text()'
        published_xpath = '//footer/time/text()'

        def pull(scope, xpath):
            # Run the XPath from ``scope`` and return the extracted strings.
            return scope.select(xpath).extract()

        items = []
        for article in articles:
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value('source', self.source)

            loader.add_value('name', pull(article, title_xpath))
            loader.add_value('image', pull(article, image_xpath))
            loader.add_value('url', response.url)

            loader.add_value('prepTime', pull(article, prep_xpath))
            loader.add_value('cookTime', pull(article, cook_xpath))
            loader.add_value('recipeYield', pull(article, yield_xpath))

            ingredient_lines = [
                text_node.extract()
                for text_node in article.select(ingredient_xpath)
            ]
            loader.add_value('ingredients', ingredient_lines)

            loader.add_value('datePublished', pull(article, published_xpath))

            items.append(loader.load_item())

        return items
示例#40
0
    def parse_item(self, response):
        """Extract a RecipeItem from each ``div#zlrecipe-innerdiv`` element.

        Every ingredient is the stripped concatenation of all text nodes
        below one ``itemprop="ingredients"`` element.

        Returns the list of loaded items (empty when nothing matches).
        """
        document = HtmlXPathSelector(response)
        recipe_divs = document.select("""//div[@id="zlrecipe-innerdiv"]""")

        title_xpath = '*//*[@itemprop="name"]/text()'
        canonical_xpath = '//link[@rel="canonical"]/@href'
        image_xpath = '//meta[@property="og:image"][1]/@content'

        prep_xpath = '*//*[@itemprop="prepTime"]/@content'
        cook_xpath = '*//*[@itemprop="cookTime"]/@content'
        yield_xpath = '*//*[@itemprop="recipeYield"]/text()'

        ingredient_xpath = '*//*[@itemprop="ingredients"]'
        published_xpath = '//*[@class="time_stamp_month"]'

        items = []
        for recipe_div in recipe_divs:
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value('source', self.source)

            loader.add_value('name', recipe_div.select(title_xpath).extract())
            loader.add_value('image', recipe_div.select(image_xpath).extract())
            loader.add_value('url', recipe_div.select(canonical_xpath).extract())

            loader.add_value('prepTime', recipe_div.select(prep_xpath).extract())
            loader.add_value('cookTime', recipe_div.select(cook_xpath).extract())
            loader.add_value('recipeYield', recipe_div.select(yield_xpath).extract())

            ingredient_lines = [
                ''.join(node.select('.//text()').extract()).strip()
                for node in recipe_div.select(ingredient_xpath)
            ]
            loader.add_value('ingredients', ingredient_lines)

            loader.add_value('datePublished',
                             recipe_div.select(published_xpath).extract())

            items.append(loader.load_item())

        return items
示例#41
0
    def parse_item(self, response):
        """Build a RecipeItem for every ``article.hrecipe`` on the page.

        Returns the list of loaded items (empty when nothing matches).
        The openrecipes pipelines apply further transforms afterwards.
        """
        selector = HtmlXPathSelector(response)

        # Container element(s) holding the recipe data; usually just one.
        containers = selector.select("""//article[@class="hrecipe"]""")

        # Field/XPath pairs, split around the point where 'url' is added
        # so the fields are loaded in a fixed order.
        fields_before_url = (
            ('name', '//h1/text()'),
            ('image', '//section[@class="content-unit"]/img/@src'),
        )
        fields_after_url = (
            ('prepTime', '//span[@class="info preptime"]/text()'),
            ('cookTime', '//span[@class="info duration"]/text()'),
            ('recipeYield', '//span[@class="info yield"]/text()'),
        )
        ingredient_xpath = '//div[@class="ingredients-section"]/ul/li/span/text()'
        published_xpath = '//footer/time/text()'

        results = []
        for container in containers:
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value('source', self.source)

            for field, xpath in fields_before_url:
                loader.add_value(field, container.select(xpath).extract())

            loader.add_value('url', response.url)

            for field, xpath in fields_after_url:
                loader.add_value(field, container.select(xpath).extract())

            # One ingredient per matched text node.
            loader.add_value('ingredients',
                             container.select(ingredient_xpath).extract())

            loader.add_value('datePublished',
                             container.select(published_xpath).extract())

            results.append(loader.load_item())

        return results
示例#42
0
    def parse_item(self, response):
        """Extract a RecipeItem from each ``div#blq-main`` element.

        An ingredient is built from the first text node of a
        ``p.ingredient`` element (the amount) and the text of its direct
        ``<a>`` children (the name), each stripped and joined by a space.

        Returns the list of loaded items (empty when nothing matches).
        """
        document = HtmlXPathSelector(response)
        main_divs = document.select("""//div[@id="blq-main"]""")

        title_xpath = '//h1/text()'
        summary_xpath = '//div[@id="description"]//span[@class="summary"]/text()'
        image_xpath = '//img[@id="food-image"]/@src'
        prep_xpath = '//span[@class="prepTime"]/span[@class="value-title"]/@title'
        cook_xpath = '//span[@class="cookTime"]/span[@class="value-title"]/@title'
        yield_xpath = '//h3[@class="yield"]/text()'
        ingredient_xpath = '//p[@class="ingredient"]'

        def stripped_text(scope, xpath):
            # Concatenate everything the XPath yields and trim the ends.
            return "".join(scope.select(xpath).extract()).strip()

        items = []
        for main_div in main_divs:
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value('source', self.source)

            loader.add_value('name', main_div.select(title_xpath).extract())
            loader.add_value('image', main_div.select(image_xpath).extract())
            loader.add_value('url', response.url)
            loader.add_value('description', main_div.select(summary_xpath).extract())

            loader.add_value('prepTime', main_div.select(prep_xpath).extract())
            loader.add_value('cookTime', main_div.select(cook_xpath).extract())
            loader.add_value('recipeYield', main_div.select(yield_xpath).extract())

            ingredient_lines = []
            for paragraph in main_div.select(ingredient_xpath):
                quantity = stripped_text(paragraph, 'text()[1]')
                label = stripped_text(paragraph, 'a/text()')
                ingredient_lines.append("%s %s" % (quantity, label))
            loader.add_value('ingredients', ingredient_lines)

            items.append(loader.load_item())

        return items
    def parse_item(self, response):
        """Extract a RecipeItem from each ``div.recipe-details`` element.

        Each ingredient string is "<amount> <name>", assembled from the
        amount and name spans inside one ``span[@itemprop="ingredient"]``
        node and UTF-8 encoded before being handed to the loader.

        Returns the list of loaded items (empty when nothing matches).
        Further transforms are applied by the openrecipes pipelines.
        """
        # we use this to run XPath commands against the HTML in the response
        hxs = HtmlXPathSelector(response)

        # this is the base XPath string for the element that contains the recipe
        # info
        base_path = """//div[@class="recipe-details"]"""

        # the select() method will return a list of HtmlXPathSelector objects.
        # On this site we will almost certainly get back just one, if any
        # exist on the page
        recipes_scopes = hxs.select(base_path)

        # it's easier to define these XPath strings outside of the loop below
        name_path = '//h1[@itemprop="name"]/text()'
        recipeYield_path = '//label[@for="set_servings"]/input/@value'
        description_path = '//span[@itemprop="summary"]/p/text()'
        image_path = '//img[@class="the_recipe_image"]/@src'
        cookTime_path = '//form/p/time[@itemprop="cookTime"]/@datetime'
        prepTime_path = '//form/p/time[@itemprop="prepTime"]/@datetime'
        ingredients_path = '//span[@itemprop="ingredient"]'
        ingredients_amounts_path = './span[@itemprop="amount"]/text()'
        # This previously repeated the "amount" expression, which produced
        # "amount amount" strings and never captured the ingredient name.
        # NOTE(review): assumes the name text sits directly inside
        # span[@itemprop="name"] - confirm against the site markup.
        ingredients_names_path = './span[@itemprop="name"]/text()'
        datePublished_path = '//span[@itemprop="published"]/@datetime'

        # init an empty list
        recipes = []

        # loop through our recipe scopes and extract the recipe data from each
        for r_scope in recipes_scopes:
            # make an empty RecipeItem
            il = RecipeItemLoader(item=RecipeItem())

            il.add_value('source', self.source)

            il.add_value('name', r_scope.select(name_path).extract())
            il.add_value('image', r_scope.select(image_path).extract())
            il.add_value('description', r_scope.select(description_path).extract())
            il.add_value('url', response.url)
            il.add_value('prepTime', r_scope.select(prepTime_path).extract())
            il.add_value('cookTime', r_scope.select(cookTime_path).extract())
            il.add_value('recipeYield', r_scope.select(recipeYield_path).extract())
            il.add_value('datePublished', r_scope.select(datePublished_path).extract())

            # Combine the amount and name spans of each ingredient node into
            # one string.  Pairing is done per node: zipping two flattened
            # lists shifts every later pair as soon as one ingredient lacks
            # an amount or a name.
            ingredients = []
            for i_scope in r_scope.select(ingredients_path):
                parts = (i_scope.select(ingredients_amounts_path).extract() +
                         i_scope.select(ingredients_names_path).extract())
                if parts:
                    ingredients.append(" ".join(parts).encode('utf-8'))

            il.add_value('ingredients', ingredients)

            # stick this RecipeItem in the array of recipes we will return
            recipes.append(il.load_item())

        # more processing is done by the openrecipes.pipelines. Look at that
        # file to see transforms that are applied to each RecipeItem
        return recipes
    def parse_item(self, response):
        """Extract a RecipeItem from each element carrying the hrecipe class.

        The yield is not marked up on its own; it is pulled out of the
        directions paragraphs by matching a "Serves <number>." phrase.

        Returns the list of loaded items (empty when nothing matches).
        """
        hxs = HtmlXPathSelector(response)

        # Matches "hrecipe" as a whole class token, wherever it appears in
        # the class attribute.
        base_path = """//*[contains(concat(' ', normalize-space(@class), ' '),
                        ' hrecipe ')]"""
        recipes_scopes = hxs.select(base_path)

        name_path = '//*[@class="fn"]/text()'
        description_path = '//*[@class="recipe-description summary"]/p/text()'
        image_path = '//img[@class="photo"]/@src'
        recipeYield_path = '//*[@class="directions"]/p/text()'
        ingredients_path = '//*[@class="ingredient"]/text()'

        recipes = []

        for r_scope in recipes_scopes:
            il = RecipeItemLoader(item=RecipeItem())

            il.add_value('source', self.source)

            il.add_value('name', r_scope.select(name_path).extract())
            il.add_value('image', r_scope.select(image_path).extract())
            il.add_value('url', response.url)
            il.add_value('description',
                         r_scope.select(description_path).extract())

            # yield given somewhere in the directions as 'Serves n.'
            # Raw string with \d+ so multi-digit counts ("Serves 12.") are
            # captured too; the old pattern only accepted a single digit.
            il.add_value('recipeYield',
                         r_scope.select(recipeYield_path).re(r'Serves \d+\.'))

            il.add_value('ingredients',
                         r_scope.select(ingredients_path).extract())

            recipes.append(il.load_item())

        return recipes
    def parse_item(self, response):
        """Extract a RecipeItem from each ``div#recipe`` element.

        Description, URL and image come from Open Graph meta tags.  The
        yield is searched for in several alternative places (italic,
        emphasised or plain paragraphs starting with "Makes" or "Serves").

        Returns the list of loaded items (empty when nothing matches).
        """
        document = HtmlXPathSelector(response)
        recipe_divs = document.select("""//div[@id="recipe"]""")

        title_xpath = 'h1/text()'
        description_xpath = '//meta[@property="og:description"]/@content'
        url_xpath = '//meta[@property="og:url"]/@content'
        image_xpath = '//meta[@property="og:image"][1]/@content'
        prep_xpath = './/span[@class="preptime"]/span[@class="value-title"]/@title'
        cook_xpath = './/span[@class="cooktime"]/span[@class="value-title"]/@title'

        # The yield is formatted very inconsistently, so all known
        # variants are combined into a single XPath union.
        yield_variants = (
            '//div[@id="recipe"]/p[starts-with(i,"Makes")]/i',
            '//div[@id="recipe"]/p[starts-with(i,"Serves")]/i',
            '//div[@id="recipe"]/p[starts-with(em,"Makes")]/em',
            '//div[@id="recipe"]/p[starts-with(em,"Serves")]/em',
            '//div[@id="recipe"][starts-with(p,"Makes")]/p',
            '//div[@id="recipe"][starts-with(p,"Serves")]/p',
        )
        yield_xpath = "|".join(yield_variants)
        ingredient_xpath = 'blockquote/*'
        published_xpath = '//span[@class="published"]/span[@class="value-title"]/@title'

        def pull(scope, xpath):
            # Run the XPath from ``scope`` and return the extracted strings.
            return scope.select(xpath).extract()

        items = []
        for recipe_div in recipe_divs:
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)

            loader.add_value('name', pull(recipe_div, title_xpath))
            loader.add_value('image', pull(recipe_div, image_xpath))
            loader.add_value('url', pull(recipe_div, url_xpath))
            loader.add_value('description', pull(recipe_div, description_xpath))

            loader.add_value('prepTime', pull(recipe_div, prep_xpath))
            loader.add_value('cookTime', pull(recipe_div, cook_xpath))
            loader.add_value('recipeYield', pull(recipe_div, yield_xpath))

            loader.add_value('ingredients', pull(recipe_div, ingredient_xpath))

            loader.add_value('datePublished', pull(recipe_div, published_xpath))

            items.append(loader.load_item())

        return items
    def parse_item(self, response):
        """Load name, image, url and ingredients from "post hrecipe" elements.

        Returns the list of loaded RecipeItems (empty when nothing matches).
        """
        selector = HtmlXPathSelector(response)
        posts = selector.select('//*[@class="post hrecipe"]')

        # (field, XPath) pairs read from the page, in loading order.
        leading_fields = (
            ('name', '//*[@class="title fn"]/text()'),
            ('image', '//*[@class="photo"]/@src'),
        )
        ingredient_xpath = '//ul[@class="ingredient_list"]/li/text()'

        results = []
        for post in posts:
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value('source', self.source)

            for field, xpath in leading_fields:
                loader.add_value(field, post.select(xpath).extract())

            loader.add_value('url', response.url)

            loader.add_value('ingredients',
                             post.select(ingredient_xpath).extract())

            results.append(loader.load_item())

        return results
    def parse_item(self, response):
        """Extract a RecipeItem from each "post hrecipe" element.

        Loads source, name, image, url, yield, ingredients and publication
        date.  Description, prep time and cook time are not extracted.

        Returns the list of loaded items (empty when nothing matches).
        """
        hxs = HtmlXPathSelector(response)

        base_path = '//*[@class="post hrecipe"]'

        recipes_scopes = hxs.select(base_path)

        name_path = '//*[@class="title fn"]/text()'
        # TODO: description - no reliable expression worked out yet.
        image_path = '//*[@class="photo"]/@src'
        # Neither cook nor prep time is available on the page.
        # NOTE(review): positional expression; check it on different sites.
        recipeYield_path = '//blockquote/p[2]/text()'
        ingredients_path = '//ul[@class="ingredient_list"]/li/text()'
        datePublished = 'normalize-space(//*[@class="postmeta"]/text())'

        recipes = []

        for r_scope in recipes_scopes:
            il = RecipeItemLoader(item=RecipeItem())

            il.add_value('source', self.source)

            il.add_value('name', r_scope.select(name_path).extract())
            il.add_value('image', r_scope.select(image_path).extract())
            il.add_value('url', response.url)

            il.add_value('recipeYield', r_scope.select(recipeYield_path).extract())

            # The expression selects the <li> text nodes themselves, so the
            # extracted strings are the ingredient lines.  The previous loop
            # body was a bare ``pass`` and always loaded an empty list.
            il.add_value('ingredients',
                         r_scope.select(ingredients_path).extract())

            il.add_value('datePublished', r_scope.select(datePublished).extract())

            recipes.append(il.load_item())

        return recipes
示例#48
0
    def parse_item(self, response):
        """Extract a RecipeItem from each element whose class contains "hrecipe".

        An ingredient is "<quantity> <name>", taken from the quantity and
        name spans that are direct children of one ``span.ingredient``.

        Returns the list of loaded items (empty when nothing matches).
        """
        document = HtmlXPathSelector(response)
        recipe_nodes = document.select("""//*[contains(@class,'hrecipe')]""")

        title_xpath = '//meta[@property="og:title"]/@content'
        description_xpath = '//meta[@name="description"]/@content'
        url_xpath = '//meta[@property="og:url"]/@content'
        image_xpath = '//*[@itemprop="image"]/@src'
        yield_xpath = '//div[@class="time-and-yield"]/*/span[@class="yield"]/text()'
        ingredient_xpath = '//ul[@class="ingredients"]/li/span[@class="ingredient"]'
        # text()[last()] skips the HTML comment that precedes the date.
        published_xpath = '//div[@class="intro"]/div[@class="display-date"]/text()[last()]'

        def stripped_text(scope, xpath):
            # Concatenate everything the XPath yields and trim the ends.
            return "".join(scope.select(xpath).extract()).strip()

        items = []
        for recipe_node in recipe_nodes:
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value('source', self.source)
            loader.add_value('name', recipe_node.select(title_xpath).extract())
            loader.add_value('image', recipe_node.select(image_xpath).extract())
            loader.add_value('url', recipe_node.select(url_xpath).extract())
            loader.add_value('description', recipe_node.select(description_xpath).extract())
            loader.add_value('recipeYield', recipe_node.select(yield_xpath).extract())

            ingredient_lines = []
            for span in recipe_node.select(ingredient_xpath):
                amount = stripped_text(span, 'span[@class="quantity"]/text()')
                label = stripped_text(span, 'span[@class="name"]/text()')
                ingredient_lines.append("%s %s" % (amount, label))
            loader.add_value('ingredients', ingredient_lines)

            loader.add_value('datePublished',
                             recipe_node.select(published_xpath).extract())

            items.append(loader.load_item())

        return items
    def parse_item(self, response):
        """Extract a RecipeItem from each ``blockquote`` with class "recipe hrecipe".

        Returns the list of loaded items (empty when nothing matches).
        """
        selector = HtmlXPathSelector(response)
        blockquotes = selector.select('//blockquote[@class="recipe hrecipe"]')

        # (field, XPath) pairs, split around the point where 'url' is
        # added so the fields are loaded in a fixed order.
        fields_before_url = (
            ('name', '//*[@class="fn"]/text()'),
            ('image', '//img[@class="photo"]/@src'),
        )
        fields_after_url = (
            ('description', '//*[@class="summary"]/p/text()'),
            ('prepTime', '//*[@class="preptime"]/text()'),
            ('cookTime', '//*[@class="cooktime"]/text()'),
            ('recipeYield', '//*[@class="yield"]/text()'),
            ('ingredients', '//*[@class="ingredient"]/p/text()'),
        )

        results = []
        for quote in blockquotes:
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value('source', self.source)

            for field, xpath in fields_before_url:
                loader.add_value(field, quote.select(xpath).extract())

            loader.add_value('url', response.url)

            for field, xpath in fields_after_url:
                loader.add_value(field, quote.select(xpath).extract())

            results.append(loader.load_item())

        return results
    def parse_item(self, response):
        """Extract a RecipeItem from each ``div.post`` element.

        The page has no dedicated ingredient markup: every paragraph of
        the entry is scored with ``ingredient_heuristic`` and only the
        text nodes of paragraphs scoring above ``RECIPE_THRESHOLD`` are
        kept as ingredients.  The description is the entry's own text
        nodes concatenated and stripped.

        Returns the list of loaded items (empty when nothing matches).
        """
        document = HtmlXPathSelector(response)
        posts = document.select('//div[@class="post"]')

        title_xpath = 'h2/a[@rel="bookmark"]/text()'
        image_xpath = '(//div[@class="entry"]/p/a[@title]/img/@src)[1]'
        description_xpath = 'div[@class="entry"]/text()'
        paragraph_xpath = 'div[@class="entry"]/p'
        published_xpath = 'div[@class="date"]/text()'

        items = []
        for post in posts:
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value('source', self.source)

            loader.add_value('name', post.select(title_xpath).extract())

            loader.add_value('image', post.select(image_xpath).extract())
            loader.add_value('url', response.url)

            description_text = ''.join(post.select(description_xpath).extract())
            loader.add_value('description', description_text.strip())

            ingredient_lines = []
            for paragraph in post.select(paragraph_xpath):
                if ingredient_heuristic(paragraph) > RECIPE_THRESHOLD:
                    ingredient_lines.extend(
                        text_node.extract().strip()
                        for text_node in paragraph.select('text()')
                    )

            loader.add_value('ingredients', ingredient_lines)

            loader.add_value('datePublished', post.select(published_xpath).extract())

            items.append(loader.load_item())

        return items
示例#51
0
    def parse_item(self, response):
        """Load one recipe item for every ``div.innerrecipe`` in the response.

        Name, image, timings, yield and ingredients are looked up below each
        recipe node; the publication date is looked up with a document-wide
        XPath. Returns the list of loaded items.
        """
        selector = HtmlXPathSelector(response)
        recipe_nodes = selector.select('//div[@class="innerrecipe"]')

        # (field, xpath) pairs; the two groups are split only because the
        # 'url' value is added between them.
        headline_fields = (
            ('name', '*//h2[@class="fn"]/text()'),
            ('image', '*//img[@class="photo"]/@src'),
        )
        detail_fields = (
            ('prepTime', '*//span[@class="preptime"]/text()'),
            ('cookTime', '*//span[@class="cooktime"]/text()'),
            ('totalTime', '*//span[@class="duration"]/text()'),
            ('recipeYield', '*//span[@class="yield"]/text()'),
        )
        ingredient_xpath = '*//*[@class="ingredient"]/p'
        published_xpath = (
            '//div[@class="post fullpost singlepost"]'
            '//div[@class="postmeta"]/text()[normalize-space()]'
        )

        items = []
        for node in recipe_nodes:
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)

            for field, xpath in headline_fields:
                loader.add_value(field, node.select(xpath).extract())
            loader.add_value('url', response.url)
            for field, xpath in detail_fields:
                loader.add_value(field, node.select(xpath).extract())

            # One ingredient per <p>: its direct text nodes glued together.
            loader.add_value(
                'ingredients',
                ["".join(paragraph.select('text()').extract())
                 for paragraph in node.select(ingredient_xpath)])

            loader.add_value('datePublished',
                             node.select(published_xpath).extract())

            items.append(loader.load_item())

        return items
示例#52
0
    def parse_item(self, response):
        """Load one recipe item for every element with ``id="recipe"``.

        Args:
            response: the page response to parse; it is wrapped in an
                ``HtmlXPathSelector`` and its ``url`` is stored on each item.

        Returns:
            A list holding the item loaded for each matching element (empty
            when nothing matches).
        """
        hxs = HtmlXPathSelector(response)

        base_path = '//*[@id="recipe"]'

        recipes_scopes = hxs.select(base_path)

        name_path = '//*[@class="row page_title clearfix"]/h2/text()'
        description_path = '//*[@class="entry"]/p//text()'
        image_path = '//*[@class="featured_image"]/img[@class="image"]/@src'
        recipeYield_path = '//*[@class="breakdown"]/tbody/tr[1]/td[1]/text()'
        ingredients_path = '*//*[@class="ingredients"]'
        # Relative to one ingredients element. These must not start with
        # '//': such a path is resolved against the whole document, so every
        # ingredients element would yield the same page-wide data (including
        # <td><strong> cells that belong to other tables).
        ingredient_amount_path = './/td/strong'
        ingredient_name_path = 'tbody/tr/td/text()'
        #the site only offers total time, so prep and cook is combined
        #prepTime_path = ''
        # timezone warning, that is over my head at this point
        #cookTime_path = '//*[@class="cook_time"]'
        # datePublished = 'TODO' not available

        recipes = []

        for r_scope in recipes_scopes:
            il = RecipeItemLoader(item=RecipeItem())

            # NOTE(review): setting 'source' is disabled here; confirm
            # whether that is deliberate.
            #il.add_value('source', self.source)

            il.add_value('name', r_scope.select(name_path).extract())
            il.add_value('image', r_scope.select(image_path).extract())
            il.add_value('url', response.url)
            il.add_value('description',
                         r_scope.select(description_path).extract())

            # il.add_value('prepTime', r_scope.select(prepTime_path).extract())
            #il.add_value('cookTime', r_scope.select(cookTime_path).extract())
            il.add_value('recipeYield',
                         r_scope.select(recipeYield_path).extract())
            #il.add_value('ingredients', r_scope.select(ingredients_path).extract())
            ingredient_scopes = r_scope.select(ingredients_path)
            ingredients = []
            for i_scope in ingredient_scopes:
                # NOTE(review): all rows of one ingredients element are
                # joined into a single "<amounts> <names>" entry; presumably
                # one entry per row is wanted -- confirm against the markup.
                amount = i_scope.select(ingredient_amount_path).extract()
                name = i_scope.select(ingredient_name_path).extract()
                amount = "".join(amount).strip()
                name = "".join(name).strip()
                ingredients.append("%s %s" % (amount, name))
            il.add_value('ingredients', ingredients)
            # il.add_value('datePublished', r_scope.select(datePublished).extract())

            recipes.append(il.load_item())

        return recipes
示例#53
0
    def parse_item(self, response):
        """Load one recipe item for every ``div#blq-main`` in the response.

        Each ingredient is built from a ``p.ingredient`` node as
        "<first text node> <link text>". Returns the list of loaded items.
        """
        def joined_text(node, xpath):
            # Concatenate every match of ``xpath`` on ``node``, trimmed.
            return "".join(node.select(xpath).extract()).strip()

        selector = HtmlXPathSelector(response)
        recipe_nodes = selector.select("""//div[@id="blq-main"]""")

        # (field, xpath) pairs; split in two because 'url' is added between
        # 'image' and 'description'.
        leading_fields = (
            ("name", "//h1/text()"),
            ("image", '//img[@id="food-image"]/@src'),
        )
        trailing_fields = (
            ("description",
             '//div[@id="description"]//span[@class="summary"]/text()'),
            ("prepTime",
             '//span[@class="prepTime"]/span[@class="value-title"]/@title'),
            ("cookTime",
             '//span[@class="cookTime"]/span[@class="value-title"]/@title'),
            ("recipeYield", '//h3[@class="yield"]/text()'),
        )
        ingredient_xpath = '//p[@class="ingredient"]'

        items = []
        for node in recipe_nodes:
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value("source", self.source)

            for field, xpath in leading_fields:
                loader.add_value(field, node.select(xpath).extract())
            loader.add_value("url", response.url)
            for field, xpath in trailing_fields:
                loader.add_value(field, node.select(xpath).extract())

            loader.add_value(
                "ingredients",
                ["%s %s" % (joined_text(entry, "text()[1]"),
                            joined_text(entry, "a/text()"))
                 for entry in node.select(ingredient_xpath)])

            items.append(loader.load_item())

        return items
示例#54
0
    def parse_item(self, response):
        """Load one recipe item for every element whose class contains
        ``hrecipe``.

        Each ingredient is built from a ``span.ingredient`` node as
        "<quantity> <name>". Returns the list of loaded items.
        """
        def joined_text(node, xpath):
            # Concatenate every match of ``xpath`` on ``node``, trimmed.
            return "".join(node.select(xpath).extract()).strip()

        selector = HtmlXPathSelector(response)
        recipe_nodes = selector.select("""//*[contains(@class,'hrecipe')]""")

        # (field, xpath) pairs, added in this order right after 'source'.
        simple_fields = (
            ('name', '//meta[@property="og:title"]/@content'),
            ('image', '//*[@itemprop="image"]/@src'),
            ('url', '//meta[@property="og:url"]/@content'),
            ('description', '//meta[@name="description"]/@content'),
            ('recipeYield',
             '//div[@class="time-and-yield"]/*/span[@class="yield"]/text()'),
        )
        ingredient_xpath = (
            '//ul[@class="ingredients"]/li/span[@class="ingredient"]')
        # text()[last()] skips the HTML comment that precedes the date.
        published_xpath = (
            '//div[@class="intro"]/div[@class="display-date"]/text()[last()]')

        items = []
        for node in recipe_nodes:
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)

            for field, xpath in simple_fields:
                loader.add_value(field, node.select(xpath).extract())

            loader.add_value(
                'ingredients',
                ["%s %s" % (joined_text(entry, 'span[@class="quantity"]/text()'),
                            joined_text(entry, 'span[@class="name"]/text()'))
                 for entry in node.select(ingredient_xpath)])

            loader.add_value('datePublished',
                             node.select(published_xpath).extract())

            items.append(loader.load_item())

        return items
    def parse_item(self, response):
        """Load one recipe item for every ``div`` typed as a schema.org Recipe.

        Ingredient text nodes are stripped and the ones left empty are
        dropped. Returns the list of loaded items.
        """
        selector = HtmlXPathSelector(response)
        recipe_nodes = selector.select(
            '//div[@itemtype="http://schema.org/Recipe"]')

        # (field, xpath) pairs; 'ingredients' is added between the two groups.
        leading_fields = (
            ('image', '(//div[@class="entry"]//img/@src)[1]'),
            ('name', '//div[@itemprop="name"]/text()'),
            ('url', '//h2[@class="title"]/a/@href'),
        )
        trailing_fields = (
            ('recipeYield', '//span[@itemprop="servingSize"]/text()'),
            ('totalTime', '//span[@itemprop="totalTime"]/@content'),
        )
        ingredient_xpath = '//li[@itemprop="ingredients"]/text()'

        items = []
        for node in recipe_nodes:
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)

            for field, xpath in leading_fields:
                loader.add_value(field, node.select(xpath).extract())

            stripped = (text_node.extract().strip()
                        for text_node in node.select(ingredient_xpath))
            loader.add_value('ingredients',
                             [text for text in stripped if text])

            for field, xpath in trailing_fields:
                loader.add_value(field, node.select(xpath).extract())

            items.append(loader.load_item())

        return items