示例#1
0
    def parse_item(self, response):
        """Extract recipe data from ``response``.

        When ``parse_recipes`` finds recipes in the page, each one is given
        the lazily-loaded image URL(s) and converted with
        ``RecipeItem.from_dict``; a list of items is returned.  Otherwise a
        single item is assembled by hand from the post title and any
        ingredient-looking nodes, and that one item is returned.
        """
        selector = HtmlXPathSelector(response)
        lazy_images = selector.select("descendant-or-self::img[@class and contains(@class, 'wp-image')][1]/@data-lazy-src").extract()

        parsed = parse_recipes(selector, {'source': self.source, 'url': response.url})
        if parsed:
            # structured (schema.org) markup was found
            for entry in parsed:
                entry['image'] = lazy_images
            return [RecipeItem.from_dict(entry) for entry in parsed]

        # no structured markup: scrape the fields manually
        loader = RecipeItemLoader(item=RecipeItem())
        loader.add_value('source', self.source)
        loader.add_value('url', response.url)
        loader.add_value('image', lazy_images)
        loader.add_value(
            'name',
            selector.select('//*[@class="post-title"]/h1/text()').extract())

        # ingredients may live in paragraphs or in list items; both kinds of
        # node are filtered through is_ingredient_container()
        candidate_xpaths = (
            '//div[@id="recipe" or @class="span9"]/p',
            '//*[@class="span9"]//ul/li',
        )
        for candidate_xpath in candidate_xpaths:
            for node in selector.select(candidate_xpath):
                if is_ingredient_container(node):
                    loader.add_value('ingredients',
                                     node.select('text()').extract())

        # explicitly marked-up ingredient list items are taken as-is
        for text_node in selector.select('//li[@class="ingredient"]/text()'):
            loader.add_value('ingredients', text_node.extract())

        return loader.load_item()
    def parse_item(self, response):
        """Return one loaded ``RecipeItem`` per ``post hrecipe`` element.

        Every item carries the spider's source, the page URL, and the name,
        image and ingredient strings matched by the XPaths below.
        """
        page = HtmlXPathSelector(response)

        title_xpath = '//*[@class="title fn"]/text()'
        photo_xpath = '//*[@class="photo"]/@src'
        ingredient_xpath = '//ul[@class="ingredient_list"]/li/text()'

        items = []
        for scope in page.select('//*[@class="post hrecipe"]'):
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)
            loader.add_value('name', scope.select(title_xpath).extract())
            loader.add_value('image', scope.select(photo_xpath).extract())
            loader.add_value('url', response.url)
            loader.add_value('ingredients',
                             scope.select(ingredient_xpath).extract())
            items.append(loader.load_item())

        return items
示例#3
0
    def parse_item(self, response):
        """Convert every recipe ``parse_recipes`` finds into a ``RecipeItem``.

        The spider's source and the response URL are handed to
        ``parse_recipes`` alongside the raw response.
        """
        defaults = {'source': self.source, 'url': response.url}
        items = []
        for raw in parse_recipes(response, defaults):
            items.append(RecipeItem.from_dict(raw))
        return items
    def parse_item(self, response):
        """Convert every recipe ``parse_recipes`` finds into a ``RecipeItem``.

        The response is wrapped in an ``HtmlXPathSelector`` first; the
        spider's source and the response URL are passed along with it.
        """
        selector = HtmlXPathSelector(response)
        defaults = {'source': self.source, 'url': response.url}
        items = []
        for raw in parse_recipes(selector, defaults):
            items.append(RecipeItem.from_dict(raw))
        return items
示例#5
0
    def parse_item(self, response):
        """Return one loaded ``RecipeItem`` per ``<article class="hrecipe">``.

        For each matching article the loader receives the source, name,
        image, URL, prep/cook times, yield, ingredients and publication
        date, in that order.
        """
        page = HtmlXPathSelector(response)

        # XPath for each scraped field, kept together for easy maintenance
        title_xpath = '//h1/text()'
        yield_xpath = '//span[@class="info yield"]/text()'
        photo_xpath = '//section[@class="content-unit"]/img/@src'
        prep_xpath = '//span[@class="info preptime"]/text()'
        cook_xpath = '//span[@class="info duration"]/text()'
        ingredient_xpath = '//div[@class="ingredients-section"]/ul/li/span/text()'
        published_xpath = '//footer/time/text()'

        items = []
        for scope in page.select("""//article[@class="hrecipe"]"""):
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value('source', self.source)
            loader.add_value('name', scope.select(title_xpath).extract())
            loader.add_value('image', scope.select(photo_xpath).extract())
            loader.add_value('url', response.url)
            loader.add_value('prepTime', scope.select(prep_xpath).extract())
            loader.add_value('cookTime', scope.select(cook_xpath).extract())
            loader.add_value('recipeYield', scope.select(yield_xpath).extract())

            # one extracted string per ingredient text node
            loader.add_value(
                'ingredients',
                [node.extract() for node in scope.select(ingredient_xpath)])

            loader.add_value('datePublished',
                             scope.select(published_xpath).extract())

            items.append(loader.load_item())

        # further transforms happen in openrecipes.pipelines
        return items
示例#6
0
    def parse_item(self, response):
        """Return one loaded ``RecipeItem`` per schema.org Recipe element.

        For each element whose ``itemtype`` is ``http://schema.org/Recipe``
        the loader receives the source, name, image, URL, prep/cook times,
        yield, ingredients and publication date.

        Each ingredient string is built as ``"<amount> <unit> <name>"`` from
        the text found under an ``ingredients-list`` ``<ul>``.
        """
        hxs = HtmlXPathSelector(response)

        base_path = '//*[@itemtype="http://schema.org/Recipe"]'

        recipes_scopes = hxs.select(base_path)

        name_path = '//*[@class="recipe-title"]/text()'
        # the description is not scraped: no consistent location for it
        image_path = '//*[@itemprop="image"]/@src'
        # "number()" is not a valid XPath 1.0 node test and made the selector
        # raise; read the text node instead, like cookTime below
        prepTime_path = '//*[@class="prep-time tooltip-element"]/text()'
        cookTime_path = '//*[@class="total-time tooltip-element"]/text()'
        recipeYield_path = '//*[@itemprop="recipeYield"]/text()'
        # may have to make ingredients more generic
        ingredients_path = '//*[@class="ingredients-list"]/ul'
        datePublished = '//*[@class="date published time"]/text()'

        recipes = []

        for r_scope in recipes_scopes:
            il = RecipeItemLoader(item=RecipeItem())

            il.add_value('source', self.source)

            il.add_value('name', r_scope.select(name_path).extract())
            il.add_value('image', r_scope.select(image_path).extract())
            il.add_value('url', response.url)

            il.add_value('prepTime', r_scope.select(prepTime_path).extract())
            il.add_value('cookTime', r_scope.select(cookTime_path).extract())
            il.add_value('recipeYield',
                         r_scope.select(recipeYield_path).extract())

            ingredient_scopes = r_scope.select(ingredients_path)
            ingredients = []
            # NOTE(review): the amount and name XPaths start with "//", so
            # they presumably match across the whole document rather than
            # inside i_scope -- confirm against a real page.
            for i_scope in ingredient_scopes:
                amount = i_scope.select(
                    '//*[@class="ingredient-n"]/text()').extract()
                ingredient_unit = i_scope.select(
                    '*//*[@class="ingredient-unit"]/text()').extract()
                name = i_scope.select(
                    '//*[@class="ingredient-name"]/text()').extract()
                amount = "".join(amount).strip()
                ingredient_unit = "".join(ingredient_unit).strip()
                name = "".join(name).strip()
                # three values need three placeholders; the previous
                # two-placeholder format string raised TypeError
                ingredients.append(
                    "%s %s %s" % (amount, ingredient_unit, name))
            il.add_value('ingredients', ingredients)

            il.add_value('datePublished',
                         r_scope.select(datePublished).extract())

            recipes.append(il.load_item())

        return recipes
    def parse_item(self, response):
        """Return one loaded ``RecipeItem`` per schema.org Recipe ``<div>``.

        For each matching ``<div>`` the loader receives the source, name,
        image, URL, description, prep/cook times, yield, ingredients and
        publication date.  The image is skipped, rather than raising, when
        the image XPath does not yield enough matches.
        """
        # we use this to run XPath commands against the HTML in the response
        hxs = HtmlXPathSelector(response)

        # this is the base XPath string for the element that contains the
        # recipe info
        base_path = """//div[@itemtype="http://schema.org/Recipe"]"""

        # select() returns a list of selector objects, one per match
        recipes_scopes = hxs.select(base_path)

        # it's easier to define these XPath strings outside of the loop below
        name_path = '//div[@itemprop="name"]/text() | //*[@itemprop="name"]//*[@class="fn"]/text()'
        description_path = '//div[@itemprop="description"]/text()'
        image_path = '//img[1]/@src'
        prepTime_path = '//time[@itemprop="prepTime"][contains(@datetime, "PT")]/@datetime | //time[@itemprop="prepTime"]//*[@class="value-title"]/@title'
        cookTime_path = '//time[@itemprop="cookTime"][contains(@datetime, "PT")]/@datetime | //time[@itemprop="cookTime"]//*[@class="value-title"]/@title'
        recipeYield_path = '//span[@itemprop="recipeYield"]/text()'
        ingredients_path = '//li[@itemprop="ingredients"]/text()'
        datePublished = '//abbr[@class="published"]/text()'

        # init an empty list
        recipes = []

        # loop through our recipe scopes and extract the recipe data from each
        for r_scope in recipes_scopes:
            # make an empty RecipeItem
            il = RecipeItemLoader(item=RecipeItem())

            il.add_value('source', self.source)

            il.add_value('name', r_scope.select(name_path).extract())

            # There's a bunch of images for each recipe, so only one is kept.
            # NOTE(review): index 1 is the *second* match although the
            # original comment said "first" -- confirm which is intended.
            # The length check avoids an IndexError (which aborted the whole
            # page) when fewer than two images match.
            images = r_scope.select(image_path).extract()
            if len(images) > 1:
                il.add_value('image', images[1])
            il.add_value('url', response.url)
            il.add_value('description',
                         r_scope.select(description_path).extract())
            il.add_value('prepTime', r_scope.select(prepTime_path).extract())
            il.add_value('cookTime', r_scope.select(cookTime_path).extract())
            il.add_value('recipeYield',
                         r_scope.select(recipeYield_path).extract())
            il.add_value('ingredients',
                         r_scope.select(ingredients_path).extract())
            il.add_value('datePublished',
                         r_scope.select(datePublished).extract())

            # stick this RecipeItem in the array of recipes we will return
            recipes.append(il.load_item())

        # more processing is done by the openrecipes.pipelines. Look at that
        # file to see transforms that are applied to each RecipeItem
        return recipes
示例#8
0
    def parse_item(self, response):
        """Turn the recipes found in ``response`` into ``RecipeItem`` objects.

        URLs ending in ``/review`` yield an empty list: the link extractor
        regex cannot easily tell review pages from recipe pages, so they are
        filtered out here instead.
        """
        is_review_page = response.url.endswith('/review')
        if is_review_page:
            return []

        defaults = {u'source': self.source, 'url': response.url}
        items = []
        for raw in parse_recipes(response, defaults):
            items.append(RecipeItem.from_dict(raw))
        return items
    def parse_item(self, response):
        """Build a single recipe item from the page and return it.

        ``parse_recipe`` supplies the base data (seeded with the URL and
        source); the image, meta description and entry title are then added
        through a ``RecipeItemLoader`` wrapped around that item.
        """
        selector = HtmlXPathSelector(response)
        seed = {'url': response.url, 'source': self.source}
        base_item = RecipeItem.from_dict(parse_recipe(selector, seed))

        il = RecipeItemLoader(item=base_item)

        image_urls = select_class(selector, 'post_image').select('@src').extract()
        il.add_value('image', image_urls)

        meta_description = selector.select('//meta[@name="description"]/@content').extract()
        il.add_value('description', meta_description)

        title_text = select_class(selector, 'entry-title').select('text()').extract()
        il.add_value('name', title_text)

        return il.load_item()
    def parse_item(self, response):
        """Build a single recipe item from the page; return it in a list.

        ``parse_recipe`` supplies the base data (seeded with the URL and
        source); the image, meta description and entry title are then added
        through a ``RecipeItemLoader`` wrapped around that item.
        """
        selector = HtmlXPathSelector(response)
        seed = {"url": response.url, "source": self.source}
        base_item = RecipeItem.from_dict(parse_recipe(selector, seed))

        il = RecipeItemLoader(item=base_item)

        image_urls = select_class(selector, "post_image").select("@src").extract()
        il.add_value("image", image_urls)

        meta_description = selector.select('//meta[@name="description"]/@content').extract()
        il.add_value("description", meta_description)

        title_text = select_class(selector, "entry-title").select("text()").extract()
        il.add_value("name", title_text)

        loaded = il.load_item()
        return [loaded]
示例#11
0
    def parse_item(self, response):
        """Return one loaded ``RecipeItem`` per matching content ``<div>``.

        For each scope the loader receives the source, name, image, URL,
        yield, publication date (when one is found) and ingredients.  The
        date text has its first ``on`` and any ``|`` removed and is stripped
        before being stored.
        """
        page = HtmlXPathSelector(response)

        # element(s) that contain the recipe info
        scopes = page.select("""//*[@id="container"]/*[@class="onepage"]/div[1]/div[@class="content"]""")

        title_xpath = '//h1[@class="title"]/text() | //*[@class="content"]/p[@style="text-align: center;"]/following-sibling::p[strong]/strong/text()'
        photo_xpath = '//*[@class="content"]/p[1]/img[contains(@class, "size-full")]/@src'
        yield_xpath = '//*[@class="content"]/p[@style="text-align: center;"]/following-sibling::p[em and strong]/em/text()'
        published_xpath = '//*[@class="phn-date"]/a[@rel="author"]/following-sibling::text()'

        # Ingredients and garnishes are both "lists" inside a <p>, separated
        # by <br>s.  The <p> holding "EVENT VENUE PARTY SIZE TYPE MENU" is
        # skipped by requiring no <strong>, <a>, or <img> child elements.
        ingredient_xpath = '//*[@class="content"]/p[not(strong or a or img) and br]/text()'

        items = []
        for scope in scopes:
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value('source', self.source)
            loader.add_value('name', scope.select(title_xpath).extract())
            loader.add_value('image', scope.select(photo_xpath).extract())
            loader.add_value('url', response.url)
            loader.add_value('recipeYield', scope.select(yield_xpath).extract())

            # raw date looks something like "ON SATURDAY NOV 28TH, 2009 |"
            date_parts = scope.select(published_xpath).extract()
            if date_parts:
                cleaned = date_parts[0].replace('on', '', 1)
                cleaned = cleaned.replace('|', '')
                loader.add_value('datePublished', cleaned.strip())

            loader.add_value('ingredients',
                             scope.select(ingredient_xpath).extract())

            items.append(loader.load_item())

        # further transforms happen in openrecipes.pipelines
        return items
示例#12
0
    def parse_item(self, response):
        """Return one loaded ``RecipeItem`` per ``<div class="blog">``.

        Ingredient and category listing pages (URLs containing
        ``/ingredients/`` or ``/category/``) yield an empty list.

        For each scope the loader receives the source, name, image, URL,
        description, yield, ingredients and -- when a date node is found --
        the publication date with the ``Posted on`` / ``in`` text removed.
        """
        if '/ingredients/' in response.url or '/category/' in response.url:
            return []

        hxs = HtmlXPathSelector(response)

        base_path = '//div[@class="blog"]'

        recipes_scopes = hxs.select(base_path)

        name_path = 'h1/a[@rel="bookmark"]/text()'
        description_path = '//meta[@property="og:description"]/@content'
        image_path = '//meta[@property="og:image"][1]/@content'
        recipeYield_path = './/*[@class="yield"]//text()[normalize-space()]'
        ingredients_path = './/*[@class="ingredient"]'
        datePublished_path = '//div[@class="blurb"]/strong/text()[1]'

        recipes = []

        for r_scope in recipes_scopes:
            il = RecipeItemLoader(item=RecipeItem())

            il.add_value('source', self.source)

            il.add_value('name', r_scope.select(name_path).extract())
            il.add_value('image', r_scope.select(image_path).extract())
            il.add_value('url', response.url)
            il.add_value('description',
                         r_scope.select(description_path).extract())

            il.add_value('recipeYield',
                         ' '.join(r_scope.select(recipeYield_path).extract()))

            # each ingredient node may hold several text fragments; join the
            # non-blank ones with single spaces
            ingredients = []
            for ingredient_node in r_scope.select(ingredients_path):
                fragments = ingredient_node.select(
                    './/text()[normalize-space()]').extract()
                ingredients.append(
                    ' '.join(fragment.strip() for fragment in fragments))

            il.add_value('ingredients', ingredients)

            # The extracted text gets its own name: the old code rebound the
            # XPath variable, so a second scope would have used the previous
            # date text as its XPath.  Also tolerate pages with no date node
            # instead of raising IndexError.
            published = r_scope.select(datePublished_path).extract()
            if published:
                il.add_value(
                    'datePublished',
                    published[0].replace('Posted on', '').replace(
                        'in', '').strip())

            recipes.append(il.load_item())

        return recipes
示例#13
0
    def parse_item(self, response):
        """Turn the recipes found in ``response`` into ``RecipeItem`` objects.

        URLs ending in ``/review`` yield an empty list: the link extractor
        regex cannot easily tell review pages from recipe pages, so they are
        filtered out here instead.
        """
        if not response.url.endswith('/review'):
            defaults = {u'source': self.source, 'url': response.url}
            found = parse_recipes(response, defaults)
            return [RecipeItem.from_dict(raw) for raw in found]

        return []
示例#14
0
    def parse_item(self, response):
        """Return one loaded ``RecipeItem`` per ``hrecipe`` element.

        For each scope the loader receives the source, name, image, URL,
        description, ingredients and publication date.  Ingredient lines
        that start with ``For `` or end with ``:`` are treated as section
        labels and dropped.  Prep time, cook time and yield are not scraped
        (the site does not give them).
        """
        page = HtmlXPathSelector(response)
        scopes = page.select('//*[@class="hrecipe"]')

        title_xpath = '//h1[@class="fn"]/text()'
        # descriptions on this site are long, and may not always appear
        summary_xpath = '//*[@class="format_text entry-content jpibfi_container"]/p/text()'
        # the image URL ends with 150x150 dimensions; left untouched
        photo_xpath = '//*[@class="photo"]/@src'
        ingredient_xpath = './/*[@class="ingredient"]/p/text()'
        published_xpath = '//span[@class="published"]/text()'

        heading_re = re.compile(r'^For ')
        items = []

        for scope in scopes:
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value('source', self.source)
            loader.add_value('name', scope.select(title_xpath).extract())
            loader.add_value('image', scope.select(photo_xpath).extract())
            loader.add_value('url', response.url)
            loader.add_value('description',
                             scope.select(summary_xpath).extract())

            kept = []
            for node in scope.select(ingredient_xpath):
                line = node.extract().strip()
                # skip "For the sauce"-style headings and "Something:" labels
                if heading_re.match(line) or line.endswith(':'):
                    continue
                kept.append(line)
            loader.add_value('ingredients', kept)

            loader.add_value('datePublished',
                             scope.select(published_xpath).extract())

            items.append(loader.load_item())

        return items
    def parse_item(self, response):
        """Return one loaded ``RecipeItem`` per ``recipe hrecipe`` element.

        For each scope the loader receives the source, name, image, URL,
        prep/cook times, yield, ingredients and publication date.
        Ingredient lines that start with ``For `` or end with ``:`` are
        treated as section labels and dropped.  The description is not
        scraped.
        """
        page = HtmlXPathSelector(response)
        scopes = page.select('//*[@class="recipe hrecipe"]')

        title_xpath = './/*[@class="fn"]/text()'
        photo_xpath = '//div/p[1]//img/@src'
        prep_xpath = '//*[@class="preptime"]/text()'
        cook_xpath = '//*[@class="cooktime"]/text()'
        yield_xpath = '//*[@class="yield"]/text()'
        ingredient_xpath = './/div[@class="ingredient"]/p/text()'
        # the postmeta text is oddly formatted; it is taken whole rather
        # than narrowed down to just the date
        published_xpath = '//*[@class="postmeta"]/text()'

        heading_re = re.compile(r'^For ')
        items = []

        for scope in scopes:
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value('source', self.source)
            loader.add_value('name', scope.select(title_xpath).extract())
            loader.add_value('image', scope.select(photo_xpath).extract())
            loader.add_value('url', response.url)

            loader.add_value('prepTime', scope.select(prep_xpath).extract())
            loader.add_value('cookTime', scope.select(cook_xpath).extract())
            loader.add_value('recipeYield',
                             scope.select(yield_xpath).extract())

            kept = []
            for node in scope.select(ingredient_xpath):
                line = node.extract().strip()
                # skip "For the sauce"-style headings and "Something:" labels
                if heading_re.match(line) or line.endswith(':'):
                    continue
                kept.append(line)
            loader.add_value('ingredients', kept)

            loader.add_value('datePublished',
                             scope.select(published_xpath).extract())

            items.append(loader.load_item())

        return items
示例#16
0
    def parse_item(self, response):
        """Return one loaded ``RecipeItem`` per ``<span class="hrecipe">``.

        For each scope the loader receives the source, name, image (made
        absolute against the response URL), URL, description, yield and
        ingredients.  Prep time, cook time and publication date are not
        available on this site and are passed as ``None``.  A recipe
        without a main image is still emitted, just without an image.
        """
        # we use this to run XPath commands against the HTML in the response
        hxs = HtmlXPathSelector(response)

        # this is the base XPath string for the element that contains the recipe
        # info
        base_path = """//span[@class="hrecipe"]"""

        # select() returns a list of selector objects, one per match
        recipes_scopes = hxs.select(base_path)

        # it's easier to define these XPath strings outside of the loop below
        name_path = '//div[@class="content"]/header/h1[@class="fn"]/text()'
        description_path = '//article[@class="recipe_description"]//text()'
        image_path = '//div[@class="recipe_image_main"]/p/img/@src'
        recipeYield_path = '//div[@class="recipe_meta"]/p/span[contains(@class,"yield")]/text()'
        ingredients_path = '//article[@class="ingredients"]//ul//li/p[@class="ingredient"]/span[@class="value"]/text()'

        # init an empty list
        recipes = []

        # loop through our recipe scopes and extract the recipe data from each
        for r_scope in recipes_scopes:
            # make an empty RecipeItem
            il = RecipeItemLoader(item=RecipeItem())

            il.add_value('source', self.source)

            il.add_value('name', r_scope.select(name_path).extract())

            # Only the first image is used.  Guard against pages with no
            # main image: the previous unconditional pop(0) raised
            # IndexError and aborted the whole page.
            image_srcs = r_scope.select(image_path).extract()
            if image_srcs:
                il.add_value('image', urljoin(response.url, image_srcs[0]))
            il.add_value('url', response.url)
            il.add_value('description', r_scope.select(description_path).extract())

            # prepTime not available
            il.add_value('prepTime', None)
            # cookTime not available
            il.add_value('cookTime', None)
            il.add_value('recipeYield', r_scope.select(recipeYield_path).extract())
            il.add_value('ingredients', r_scope.select(ingredients_path).extract())

            # datePublished not available
            il.add_value('datePublished', None)

            # stick this RecipeItem in the array of recipes we will return
            recipes.append(il.load_item())

        # more processing is done by the openrecipes.pipelines. Look at that
        # file to see transforms that are applied to each RecipeItem
        return recipes
示例#17
0
    def parse_item(self, response):
        """Return one loaded ``RecipeItem`` per element with ``id="recipe"``.

        For each scope the loader receives the name, image, URL,
        description, yield and ingredients.  Each ingredient string is
        ``"<amount> <name>"``, built from the joined and stripped results of
        the two ingredient XPaths.

        NOTE(review): unlike sibling parsers, ``source`` is not set here;
        prep time, cook time and publication date are not scraped either
        (the site only offers a total time and no date).
        """
        page = HtmlXPathSelector(response)
        scopes = page.select('//*[@id="recipe"]')

        title_xpath = '//*[@class="row page_title clearfix"]/h2/text()'
        summary_xpath = '//*[@class="entry"]/p//text()'
        photo_xpath = '//*[@class="featured_image"]/img[@class="image"]/@src'
        yield_xpath = '//*[@class="breakdown"]/tbody/tr[1]/td[1]/text()'
        ingredient_table_xpath = '*//*[@class="ingredients"]'

        items = []

        for scope in scopes:
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value('name', scope.select(title_xpath).extract())
            loader.add_value('image', scope.select(photo_xpath).extract())
            loader.add_value('url', response.url)
            loader.add_value('description',
                             scope.select(summary_xpath).extract())
            loader.add_value('recipeYield',
                             scope.select(yield_xpath).extract())

            lines = []
            for table in scope.select(ingredient_table_xpath):
                amount_text = "".join(
                    table.select('//td/strong').extract()).strip()
                name_text = "".join(
                    table.select(
                        '//*[@class="ingredients"]/tbody/tr/td/text()'
                    ).extract()).strip()
                lines.append("%s %s" % (amount_text, name_text))
            loader.add_value('ingredients', lines)

            items.append(loader.load_item())

        return items
    def parse_item(self, response):
        """Return one loaded ``RecipeItem`` per ``recipe hrecipe`` element.

        For each scope the loader receives the source, name, image, URL,
        description, prep/cook times, yield and ingredients.  Ingredient
        lines that start with ``For `` or end with ``:`` are treated as
        section labels and dropped.  The publication date is not scraped.
        """
        page = HtmlXPathSelector(response)
        scopes = page.select('//*[@class="recipe hrecipe"]')

        title_xpath = '//*[@class="fn"]/text()'
        summary_xpath = '//*[@class="summary"]/p/text()'
        photo_xpath = '//p[1]/span/img/@src'
        prep_xpath = '//*[@class="preptime"]/text()'
        cook_xpath = './/*[@class="cooktime"]/text()'
        yield_xpath = '//*[@class="yield"]/text()'
        ingredient_xpath = './/*[@class="ingredient"]/p/text()'

        heading_re = re.compile(r'^For ')
        items = []

        for scope in scopes:
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value('source', self.source)
            loader.add_value('name', scope.select(title_xpath).extract())
            loader.add_value('image', scope.select(photo_xpath).extract())
            loader.add_value('url', response.url)
            loader.add_value('description',
                             scope.select(summary_xpath).extract())

            loader.add_value('prepTime', scope.select(prep_xpath).extract())
            loader.add_value('cookTime', scope.select(cook_xpath).extract())
            loader.add_value('recipeYield',
                             scope.select(yield_xpath).extract())

            kept = []
            for node in scope.select(ingredient_xpath):
                line = node.extract().strip()
                # skip "For the sauce"-style headings and "Something:" labels
                if heading_re.match(line) or line.endswith(':'):
                    continue
                kept.append(line)
            loader.add_value('ingredients', kept)

            items.append(loader.load_item())

        return items
    def parse_item(self, response):
        """Return one loaded ``RecipeItem`` per ``<div id="recipe">``.

        For each scope the loader receives the source, name, image, URL
        (taken from the ``og:url`` meta tag rather than the response),
        description, prep/cook times, yield, ingredients and publication
        date.
        """
        page = HtmlXPathSelector(response)
        scopes = page.select("""//div[@id="recipe"]""")

        title_xpath = 'h1/text()'
        summary_xpath = '//meta[@property="og:description"]/@content'
        canonical_xpath = '//meta[@property="og:url"]/@content'
        photo_xpath = '//meta[@property="og:image"][1]/@content'
        prep_xpath = './/span[@class="preptime"]/span[@class="value-title"]/@title'
        cook_xpath = './/span[@class="cooktime"]/span[@class="value-title"]/@title'

        # the yield is formatted very inconsistently, so several alternative
        # locations are OR-ed together into one XPath union
        yield_alternatives = (
            '//div[@id="recipe"]/p[starts-with(i,"Makes")]/i',
            '//div[@id="recipe"]/p[starts-with(i,"Serves")]/i',
            '//div[@id="recipe"]/p[starts-with(em,"Makes")]/em',
            '//div[@id="recipe"]/p[starts-with(em,"Serves")]/em',
            '//div[@id="recipe"][starts-with(p,"Makes")]/p',
            '//div[@id="recipe"][starts-with(p,"Serves")]/p',
        )
        yield_xpath = "|".join(yield_alternatives)

        ingredient_xpath = 'blockquote/*'
        published_xpath = '//span[@class="published"]/span[@class="value-title"]/@title'

        items = []
        for scope in scopes:
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value('source', self.source)
            loader.add_value('name', scope.select(title_xpath).extract())
            loader.add_value('image', scope.select(photo_xpath).extract())
            loader.add_value('url', scope.select(canonical_xpath).extract())
            loader.add_value('description',
                             scope.select(summary_xpath).extract())
            loader.add_value('prepTime', scope.select(prep_xpath).extract())
            loader.add_value('cookTime', scope.select(cook_xpath).extract())
            loader.add_value('recipeYield',
                             scope.select(yield_xpath).extract())
            loader.add_value('ingredients',
                             scope.select(ingredient_xpath).extract())
            loader.add_value('datePublished',
                             scope.select(published_xpath).extract())

            items.append(loader.load_item())

        return items
示例#20
0
    def parse_item(self, response):
        """Turn the recipes found in ``response`` into ``RecipeItem`` objects.

        URLs ending in ``/review`` yield an empty list, since the link
        extractor regex cannot easily tell review pages from recipe pages.
        Any ``photo`` / ``image`` value on a parsed recipe is passed through
        ``flatten`` before conversion.
        """
        if response.url.endswith('/review'):
            return []

        selector = HtmlXPathSelector(response)
        parsed = parse_recipes(selector, {'source': self.source})

        for entry in parsed:
            for picture_key in ('photo', 'image'):
                if picture_key in entry:
                    entry[picture_key] = flatten(entry[picture_key])

        return [RecipeItem.from_dict(entry) for entry in parsed]
示例#21
0
    def parse_item(self, response):
        """Turn the recipes found in ``response`` into ``RecipeItem`` objects.

        URLs ending in ``/review`` yield an empty list, since the link
        extractor regex cannot easily tell review pages from recipe pages.
        Any ``photo`` / ``image`` value on a parsed recipe is passed through
        ``flatten`` before conversion.
        """
        on_review_page = response.url.endswith('/review')
        if on_review_page:
            return []

        selector = HtmlXPathSelector(response)
        parsed = parse_recipes(selector, {'source': self.source})

        picture_keys = ('photo', 'image')
        for entry in parsed:
            for key in picture_keys:
                if key in entry:
                    entry[key] = flatten(entry[key])

        items = []
        for entry in parsed:
            items.append(RecipeItem.from_dict(entry))
        return items
示例#22
0
    def parse_item(self, response):
        """Return one loaded ``RecipeItem`` per ``<div class="innerrecipe">``.

        For each scope the loader receives the source, name, image, URL,
        prep/cook/total times, yield, ingredients and publication date.
        Each ingredient is the concatenation of the direct text nodes of one
        ingredient ``<p>``.
        """
        page = HtmlXPathSelector(response)
        scopes = page.select('//div[@class="innerrecipe"]')

        title_xpath = '*//h2[@class="fn"]/text()'
        photo_xpath = '*//img[@class="photo"]/@src'
        prep_xpath = '*//span[@class="preptime"]/text()'
        cook_xpath = '*//span[@class="cooktime"]/text()'
        total_xpath = '*//span[@class="duration"]/text()'
        yield_xpath = '*//span[@class="yield"]/text()'
        published_xpath = '//div[@class="post fullpost singlepost"]//div[@class="postmeta"]/text()[normalize-space()]'
        ingredient_xpath = '*//*[@class="ingredient"]/p'

        items = []

        for scope in scopes:
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value('source', self.source)
            loader.add_value('name', scope.select(title_xpath).extract())
            loader.add_value('image', scope.select(photo_xpath).extract())
            loader.add_value('url', response.url)

            loader.add_value('prepTime', scope.select(prep_xpath).extract())
            loader.add_value('cookTime', scope.select(cook_xpath).extract())
            loader.add_value('totalTime', scope.select(total_xpath).extract())
            loader.add_value('recipeYield',
                             scope.select(yield_xpath).extract())

            # glue the text fragments of each ingredient paragraph together
            lines = [
                "".join(paragraph.select('text()').extract())
                for paragraph in scope.select(ingredient_xpath)
            ]
            loader.add_value('ingredients', lines)

            loader.add_value('datePublished',
                             scope.select(published_xpath).extract())

            items.append(loader.load_item())

        return items
示例#23
0
    def parse_item(self, response):
        """Load a ``RecipeItem`` for each element whose class is ``hrecipe``.

        Many recipes on the site lack the semantic markup; those are
        deliberately not handled, so such pages yield no items.

        Returns:
            list of loaded items, one per ``hrecipe`` element.
        """
        hxs = HtmlXPathSelector(response)

        xpaths = {
            'name': './/*[@class="fn"]/text()',
            'image': '//meta[@property="og:image"][1]/@content',
            'url': '//meta[@property="og:url"]/@content',
            'recipeYield': './/*[@class="yield"]/text()',
            # the date comes from the rest of the page, not from under hrecipe
            'datePublished': '//*[@class="date"][1]',
        }
        ingredient_xpath = '*//*[@class="ingredient"]'

        def joined_text(node, xpath):
            # concatenate everything the xpath extracts, then trim the ends
            return "".join(node.select(xpath).extract()).strip()

        loaded = []
        for recipe_node in hxs.select('//*[@class="hrecipe"]'):
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)

            for field in ('name', 'image', 'url', 'recipeYield'):
                loader.add_value(
                    field, recipe_node.select(xpaths[field]).extract())

            lines = []
            for ingredient_node in recipe_node.select(ingredient_xpath):
                amount = joined_text(
                    ingredient_node, '*[@class="amount"]/text()')
                name = joined_text(ingredient_node, '*[@class="name"]/text()')
                lines.append("%s %s" % (amount, name))
            loader.add_value('ingredients', lines)

            loader.add_value(
                'datePublished',
                recipe_node.select(xpaths['datePublished']).extract())

            loaded.append(loader.load_item())

        return loaded
示例#24
0
    def parse_item(self, response):
        """Load a single recipe, topping up what ``parse_recipe`` returns.

        The dict from ``parse_recipe`` seeds the item; ``image``,
        ``description`` and ``name`` values taken from the page are then
        added through a ``RecipeItemLoader``.

        Returns:
            a one-element list holding the loaded item.
        """
        hxs = HtmlXPathSelector(response)

        seed = parse_recipe(hxs, {'url': response.url, 'source': self.source})
        loader = RecipeItemLoader(item=RecipeItem.from_dict(seed))

        # gather the three page-level values first, then hand them over
        image_urls = select_class(hxs, 'post_image').select('@src').extract()
        summary = hxs.select('//meta[@name="description"]/@content').extract()
        title = select_class(hxs, 'entry-title').select('text()').extract()

        loader.add_value('image', image_urls)
        loader.add_value('description', summary)
        loader.add_value('name', title)

        return [loader.load_item()]
示例#25
0
    def parse_item(self, response):
        """Skeleton ``parse_item``: every XPath below is a ``'TODO'`` placeholder.

        The flow matches a finished spider (select recipe scopes, feed a
        ``RecipeItemLoader`` per scope, return the loaded items), but the
        ingredient loop collects nothing yet.
        """
        hxs = HtmlXPathSelector(response)

        base_path = 'TODO'
        ingredients_path = 'TODO'
        date_published_path = 'TODO'

        # (field, xpath) pairs added before and after the page URL
        leading_fields = (
            ('name', 'TODO'),
            ('image', 'TODO'),
        )
        trailing_fields = (
            ('description', 'TODO'),
            ('prepTime', 'TODO'),
            ('cookTime', 'TODO'),
            ('recipeYield', 'TODO'),
        )

        loaded = []
        for scope in hxs.select(base_path):
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)

            for field, xpath in leading_fields:
                loader.add_value(field, scope.select(xpath).extract())
            loader.add_value('url', response.url)
            for field, xpath in trailing_fields:
                loader.add_value(field, scope.select(xpath).extract())

            ingredients = []
            for _ingredient_node in scope.select(ingredients_path):
                # placeholder: no ingredient text is built or appended yet
                pass
            loader.add_value('ingredients', ingredients)

            loader.add_value('datePublished',
                             scope.select(date_published_path).extract())

            loaded.append(loader.load_item())

        return loaded
示例#26
0
    def parse_item(self, response):
        """Load a ``RecipeItem`` per ``schema.org/Recipe`` scope in the page.

        Returns:
            list of loaded items, one per element carrying the Recipe
            ``itemtype``.
        """
        hxs = HtmlXPathSelector(response)

        # plain fields: the loader gets whatever the xpath extracts
        plain_fields = (
            ('name', '//*[@itemprop="name"]/text()'),
            ('image', '//*[@itemprop="image"]/@src'),
            ('url', '//meta[@property="og:url"]/@content'),
            ('description', '//*[@itemprop="description"]/text()'),
        )
        # duration fields: the selector itself is handed to parse_iso_date
        duration_fields = (
            ('prepTime', '//*[@itemprop="prepTime"]'),
            ('cookTime', '//*[@itemprop="cookTime"]'),
        )
        yield_xpath = '*//*[@itemprop="recipeYield"]/text()'
        ingredient_xpath = '//*[@itemprop="ingredients"]'

        results = []
        for scope in hxs.select('//*[@itemtype="http://schema.org/Recipe"]'):
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', 'allrecipes')

            for field, xpath in plain_fields:
                loader.add_value(field, scope.select(xpath).extract())

            for field, xpath in duration_fields:
                loader.add_value(field, parse_iso_date(scope.select(xpath)))

            loader.add_value('recipeYield', scope.select(yield_xpath).extract())

            # each ingredient is the text of its child nodes, space separated
            loader.add_value('ingredients', [
                ' '.join(node.select('node()/text()').extract())
                for node in scope.select(ingredient_xpath)
            ])

            results.append(loader.load_item())

        return results
示例#27
0
    def parse_item(self, response):
        """Load a ``RecipeItem`` for each ``div#blq-main`` node in the page.

        Each ingredient string is the first text node of a
        ``p.ingredient`` element followed by the text of its links.

        Returns:
            list of loaded items.
        """
        hxs = HtmlXPathSelector(response)

        def squashed(node, xpath):
            # join the extracted strings and trim surrounding whitespace
            return "".join(node.select(xpath).extract()).strip()

        before_url = (
            ('name', '//h1/text()'),
            ('image', '//img[@id="food-image"]/@src'),
        )
        after_url = (
            ('description',
             '//div[@id="description"]//span[@class="summary"]/text()'),
            ('prepTime',
             '//span[@class="prepTime"]/span[@class="value-title"]/@title'),
            ('cookTime',
             '//span[@class="cookTime"]/span[@class="value-title"]/@title'),
            ('recipeYield', '//h3[@class="yield"]/text()'),
        )
        ingredient_xpath = '//p[@class="ingredient"]'

        collected = []
        for scope in hxs.select('//div[@id="blq-main"]'):
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)

            for field, xpath in before_url:
                loader.add_value(field, scope.select(xpath).extract())
            loader.add_value('url', response.url)
            for field, xpath in after_url:
                loader.add_value(field, scope.select(xpath).extract())

            lines = []
            for paragraph in scope.select(ingredient_xpath):
                amount = squashed(paragraph, 'text()[1]')
                name = squashed(paragraph, 'a/text()')
                lines.append("%s %s" % (amount, name))
            loader.add_value('ingredients', lines)

            collected.append(loader.load_item())

        return collected
示例#28
0
def clean_item(old_dict):
    """Re-run a stored recipe dict through ``RecipeItemLoader``.

    Args:
        old_dict: mapping for a stored recipe. It must contain ``ts`` and
            ``_id`` keys, and ``name``/``source`` as well when ``VERBOSE``
            is set.

    Returns:
        tuple ``(new_item, source_dict)``: the freshly loaded item, and a
        copy of ``old_dict`` with ``ts`` and ``_id`` removed.

    Raises:
        KeyError: if ``ts`` or ``_id`` is missing from ``old_dict``.
    """
    # work on a shallow copy so the caller's dict keeps ts and _id
    source_dict = dict(old_dict)
    # ts and _id are not passed on to the loader
    del source_dict['ts']
    del source_dict['_id']

    if VERBOSE:
        # single-argument call form prints identically on Python 2 and 3
        print("Examining '%s' from '%s' (%s)..." % (
            old_dict['name'], old_dict['source'], old_dict['_id']))

    loader = RecipeItemLoader(RecipeItem())
    # .items() exists on both Python 2 and 3; iteritems() is Python 2 only
    for k, v in source_dict.items():
        loader = set_value(loader, k, v)

    new_item = loader.load_item()
    return new_item, source_dict
示例#29
0
  def parse_item(self, response):
    """Convert the recipes from ``self.parse_recipes`` into ``RecipeItem``s.

    ``photo`` and ``image`` values are passed through ``flatten`` and, when
    the result starts with ``//``, prefixed with ``http:``.
    """
    hxs = HtmlXPathSelector(response)
    defaults = {'source': self.source, 'url': response.url}
    raw_recipes = self.parse_recipes(hxs, defaults)

    for recipe in raw_recipes:
      for key in ('photo', 'image'):
        if key not in recipe:
          continue
        link = flatten(recipe[key])
        # scheme-less URL: pin it to plain http
        if link.startswith('//'):
          link = 'http:' + link
        recipe[key] = link

    return [RecipeItem.from_dict(recipe) for recipe in raw_recipes]
示例#30
0
    def parse_item(self, response):
        """Turn the recipes ``parse_recipes`` finds into ``RecipeItem`` objects.

        URLs containing ``/reviews/`` are rejected here because the link
        extractor regex cannot easily separate them from recipe pages.
        ``photo``/``image`` values are flattened and every ``_med.`` in them
        is replaced with ``_lg.``.

        Returns:
            list of ``RecipeItem`` (empty for review pages).
        """
        if '/reviews/' in response.url:
            return []

        selector = HtmlXPathSelector(response)
        parsed = parse_recipes(
            selector, {'source': self.source, 'url': response.url})

        for entry in parsed:
            for field in ('photo', 'image'):
                if field in entry:
                    entry[field] = flatten(entry[field])
                    entry[field] = entry[field].replace('_med.', '_lg.')

        return [RecipeItem.from_dict(entry) for entry in parsed]
示例#31
0
    def parse_item(self, response):
        """Load a ``RecipeItem`` per ``div#zlrecipe-innerdiv`` node.

        Name, times, yield and ingredients use paths relative to the recipe
        node; URL, image and publication date use document-wide paths.

        Returns:
            list of loaded items.
        """
        hxs = HtmlXPathSelector(response)

        # (field, xpath) in loader order; ingredients sit between the two runs
        fields_before_ingredients = (
            ('name', '*//*[@itemprop="name"]/text()'),
            ('image', '//meta[@property="og:image"][1]/@content'),
            ('url', '//link[@rel="canonical"]/@href'),
            ('prepTime', '*//*[@itemprop="prepTime"]/@content'),
            ('cookTime', '*//*[@itemprop="cookTime"]/@content'),
            ('recipeYield', '*//*[@itemprop="recipeYield"]/text()'),
        )
        fields_after_ingredients = (
            ('datePublished', '//*[@class="time_stamp_month"]'),
        )
        ingredient_xpath = '*//*[@itemprop="ingredients"]'

        items = []
        for scope in hxs.select('//div[@id="zlrecipe-innerdiv"]'):
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)

            for field, xpath in fields_before_ingredients:
                loader.add_value(field, scope.select(xpath).extract())

            # all descendant text of each ingredient node, joined and trimmed
            loader.add_value('ingredients', [
                ''.join(node.select('.//text()').extract()).strip()
                for node in scope.select(ingredient_xpath)
            ])

            for field, xpath in fields_after_ingredients:
                loader.add_value(field, scope.select(xpath).extract())

            items.append(loader.load_item())

        return items
示例#32
0
    def parse_item(self, response):
        """Convert every recipe returned by ``parse_recipes`` to a ``RecipeItem``.

        Review pages (URL contains ``/reviews/``) are hard to exclude in the
        link extractor regex, so they are dropped here and yield no items.
        Picture fields are flattened, then ``_med.`` is swapped for ``_lg.``.
        """
        on_review_page = '/reviews/' in response.url
        if on_review_page:
            return []

        hxs = HtmlXPathSelector(response)
        defaults = {'source': self.source, 'url': response.url}
        found = parse_recipes(hxs, defaults)

        for raw in found:
            if 'photo' in raw:
                photo = flatten(raw['photo'])
                raw['photo'] = photo
                raw['photo'] = raw['photo'].replace('_med.', '_lg.')
            if 'image' in raw:
                image = flatten(raw['image'])
                raw['image'] = image
                raw['image'] = raw['image'].replace('_med.', '_lg.')

        items = []
        for raw in found:
            items.append(RecipeItem.from_dict(raw))
        return items
示例#33
0
    def parse_item(self, response):
        """Load a ``RecipeItem`` per element whose class contains ``hrecipe``.

        Each ingredient string is ``"<quantity> <name>"`` built from the
        ``span.quantity`` and ``span.name`` children of an ingredient span.

        Returns:
            list of loaded items.
        """
        hxs = HtmlXPathSelector(response)

        simple_fields = (
            ('name', '//meta[@property="og:title"]/@content'),
            ('image', '//*[@itemprop="image"]/@src'),
            ('url', '//meta[@property="og:url"]/@content'),
            ('description', '//meta[@name="description"]/@content'),
            ('recipeYield',
             '//div[@class="time-and-yield"]/*/span[@class="yield"]/text()'),
        )
        ingredient_xpath = '//ul[@class="ingredients"]/li/span[@class="ingredient"]'
        # text()[last()] skips the HTML comment
        published_xpath = '//div[@class="intro"]/div[@class="display-date"]/text()[last()]'

        def trimmed(node, xpath):
            # glue the extracted strings together and strip the ends
            return "".join(node.select(xpath).extract()).strip()

        loaded = []
        for scope in hxs.select("//*[contains(@class,'hrecipe')]"):
            loader = RecipeItemLoader(item=RecipeItem())
            loader.add_value('source', self.source)

            for field, xpath in simple_fields:
                loader.add_value(field, scope.select(xpath).extract())

            lines = []
            for span in scope.select(ingredient_xpath):
                quantity = trimmed(span, 'span[@class="quantity"]/text()')
                name = trimmed(span, 'span[@class="name"]/text()')
                lines.append("%s %s" % (quantity, name))
            loader.add_value('ingredients', lines)

            loader.add_value('datePublished',
                             scope.select(published_xpath).extract())

            loaded.append(loader.load_item())

        return loaded
    def parse_item(self, response):
        """Load a ``RecipeItem`` for each element with ``id="recipe"``.

        Ingredients come from table rows: the extracted ``<td>`` markup of
        each row is joined with spaces and added as-is.

        Returns:
            list of loaded items.
        """
        hxs = HtmlXPathSelector(response)

        title_xpath = '//*[@class="row page_title clearfix"]/h2/text()'
        picture_xpath = '//*[@class="featured_image"]/img[@class="image"]/@src'
        summary_xpath = '//*[@class="entry"]/p//text()'
        servings_xpath = '//*[@class="breakdown"]/tbody/tr[1]/td[1]/text()'
        rows_xpath = '//*[@class="ingredients"]/tr'

        loaded = []
        for scope in hxs.select('//*[@id="recipe"]'):
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value('source', self.source)
            loader.add_value('name', scope.select(title_xpath).extract())
            loader.add_value('image', scope.select(picture_xpath).extract())
            loader.add_value('url', response.url)
            loader.add_value('description',
                             scope.select(summary_xpath).extract())
            loader.add_value('recipeYield',
                             scope.select(servings_xpath).extract())

            # one entry per TR: its TDs (still HTML) combined into one string;
            # the markup is expected to be stripped later in the pipeline
            loader.add_value('ingredients', [
                " ".join(row.select('td').extract()).strip()
                for row in scope.select(rows_xpath)
            ])

            loaded.append(loader.load_item())

        return loaded
示例#35
0
    def parse_item(self, response):
        """Load a ``RecipeItem`` per ``div#content`` node in the page.

        Ingredient lines that start with ``For `` or end with ``:`` are
        treated as section labels and left out.

        Returns:
            list of loaded items.
        """
        hxs = HtmlXPathSelector(response)

        title_xpath = './/span[@class="item"]/h2[@class="fn"]/text()'
        picture_xpath = "descendant-or-self::img[@class and contains(concat(' ', normalize-space(@class), ' '), ' size-full ')][1]/@src"
        timing_fields = (
            ('prepTime', './/span[@class="preptime"]/text()'),
            ('cookTime', './/span[@class="cooktime"]/text()'),
            ('recipeYield', './/span[@class="yield"]/text()'),
        )
        lines_xpath = './/div[@class="ingredient"]/p/text()'

        section_label = re.compile(r'^For ')

        loaded = []
        for scope in hxs.select('//div[@id="content"]'):
            loader = RecipeItemLoader(item=RecipeItem())

            loader.add_value('source', self.source)
            loader.add_value('name', scope.select(title_xpath).extract())
            loader.add_value('image', scope.select(picture_xpath).extract())
            loader.add_value('url', response.url)

            for field, xpath in timing_fields:
                loader.add_value(field, scope.select(xpath).extract())

            kept = []
            for text_node in scope.select(lines_xpath):
                line = text_node.extract().strip()
                # skip headings such as "For the sauce" or "Dressing:"
                if section_label.match(line) or line.endswith(':'):
                    continue
                kept.append(line)
            loader.add_value('ingredients', kept)

            loaded.append(loader.load_item())

        return loaded
示例#36
0
    def parse_item(self, response):
        """Load a ``RecipeItem`` per data-vocabulary.org Recipe ``<article>``.

        Ingredient strings are assembled one ``<li itemprop="ingredient">``
        at a time. Extracting all amounts and all names as two flat lists
        and zipping them pairs entries purely by position, so a single
        ingredient without an amount (or with several amount text nodes)
        would mismatch every later pair and drop trailing entries.

        Returns:
            list of loaded items, one per matching ``<article>``.
        """
        hxs = HtmlXPathSelector(response)

        base_path = '//article[@itemtype="http://data-vocabulary.org/Recipe"]'

        recipes_scopes = hxs.select(base_path)

        name_path = '//h1[@itemprop="name"]/text()'
        description_path = '//meta[@name="description"]/@content'
        image_path = '//img[@itemprop="photo"]/@src'
        prepTime_path = '//span[@itemprop="prepTime"]/text()'
        cookTime_path = '//span[@itemprop="cookTime"]/text()'
        recipeYield_path = '//span[@itemprop="yield"]/text()'
        ingredients_path = '//li[@itemprop="ingredient"]'
        # both paths below are relative to a single ingredient <li>
        ingredients_amounts_path = './span[@itemprop="amount"]/span/text()'
        ingredients_names_path = './span[@itemprop="name"]/text()'

        recipes = []

        for r_scope in recipes_scopes:
            il = RecipeItemLoader(item=RecipeItem())

            il.add_value('source', self.source)

            il.add_value('name', r_scope.select(name_path).extract())
            il.add_value('image', r_scope.select(image_path).extract())
            il.add_value('url', response.url)
            il.add_value('description', r_scope.select(description_path).extract())

            il.add_value('prepTime', r_scope.select(prepTime_path).extract())
            il.add_value('cookTime', r_scope.select(cookTime_path).extract())
            il.add_value('recipeYield', r_scope.select(recipeYield_path).extract())

            # build "amount name" per <li> so the two halves always belong
            # to the same ingredient
            ingredients = []
            for i_scope in r_scope.select(ingredients_path):
                parts = i_scope.select(ingredients_amounts_path).extract()
                parts += i_scope.select(ingredients_names_path).extract()
                if parts:
                    # values stay UTF-8 encoded, as they were before
                    ingredients.append(" ".join(parts).encode('utf-8'))
            il.add_value('ingredients', ingredients)

            recipes.append(il.load_item())

        return recipes
示例#37
0
    def parse_item(self, response):
        """Wrap each recipe dict from ``parse_recipes`` in a ``RecipeItem``.

        The spider's ``source`` and the response URL are passed to
        ``parse_recipes`` alongside the page selector.
        """
        hxs = HtmlXPathSelector(response)
        defaults = {'source': self.source, 'url': response.url}

        items = []
        for raw in parse_recipes(hxs, defaults):
            items.append(RecipeItem.from_dict(raw))
        return items
示例#38
0
 def parse_item(self, response):
     """Wrap each recipe dict from ``parse_recipes`` in a ``RecipeItem``.

     The response itself (not a selector built from it) is handed to
     ``parse_recipes``, together with the spider's ``source`` and the URL.
     """
     defaults = {'source': self.source, 'url': response.url}

     items = []
     for raw in parse_recipes(response, defaults):
         items.append(RecipeItem.from_dict(raw))
     return items