Пример #1
0
    def parse_item(self, response):
        """Build recipe item(s) from a single crawled page.

        ``parse_recipes`` is tried first; when it returns something truthy
        (the schema.org case, per the original author's notes) every parsed
        recipe gets the page image attached and is converted with
        ``RecipeItem.from_dict``.  Otherwise the page is scraped by hand with
        a ``RecipeItemLoader`` and a handful of XPaths.

        :param response: the downloaded page; ``response.url`` is recorded
            on the resulting item(s).
        :returns: a list of ``RecipeItem`` objects on the ``parse_recipes``
            path, or whatever ``RecipeItemLoader.load_item()`` returns on
            the fallback path.
        """
        selector = HtmlXPathSelector(response)
        # ``data-lazy-src`` of the matching ``wp-image`` <img>; the extracted
        # value is stored on the items exactly as returned by ``extract()``.
        image_src = selector.select("descendant-or-self::img[@class and contains(@class, 'wp-image')][1]/@data-lazy-src").extract()

        structured = parse_recipes(
            selector, {'source': self.source, 'url': response.url})

        if structured:
            # schema.org markup was found: decorate first, convert second.
            for entry in structured:
                entry['image'] = image_src
            return [RecipeItem.from_dict(entry) for entry in structured]

        # No schema.org markup: assemble the item field by field.
        loader = RecipeItemLoader(item=RecipeItem())
        loader.add_value('source', self.source)
        loader.add_value('url', response.url)
        loader.add_value('image', image_src)
        loader.add_value(
            'name',
            selector.select('//*[@class="post-title"]/h1/text()').extract())

        # Ingredients may sit in paragraphs or in list items; probe both,
        # in that order, keeping only nodes that look like ingredient
        # containers.
        candidate_xpaths = (
            '//div[@id="recipe" or @class="span9"]/p',
            '//*[@class="span9"]//ul/li',
        )
        for xpath in candidate_xpaths:
            for node in selector.select(xpath):
                if is_ingredient_container(node):
                    loader.add_value(
                        'ingredients', node.select('text()').extract())

        # Explicitly marked ingredient <li> elements are taken unconditionally.
        for text_node in selector.select('//li[@class="ingredient"]/text()'):
            loader.add_value('ingredients', text_node.extract())

        return loader.load_item()
    def parse_item(self, response):
        """Convert a crawled page into a list of ``RecipeItem`` objects.

        :param response: the downloaded page; its ``url`` is handed to
            ``parse_recipes`` together with ``self.source``.
        :returns: one ``RecipeItem`` per entry returned by ``parse_recipes``.
        """
        selector = HtmlXPathSelector(response)
        page_info = dict(source=self.source, url=response.url)

        items = []
        for parsed in parse_recipes(selector, page_info):
            items.append(RecipeItem.from_dict(parsed))
        return items
Пример #3
0
    def parse_item(self, response):
        """Convert a recipe page into a list of ``RecipeItem`` objects.

        :param response: the downloaded page.
        :returns: an empty list for URLs ending in ``/review``; otherwise one
            ``RecipeItem`` per entry returned by ``parse_recipes``.
        """
        # The link extractor regex cannot easily tell review pages apart
        # from recipe pages, so they are filtered out here instead.
        if response.url.endswith('/review'):
            return []

        selector = HtmlXPathSelector(response)
        parsed = parse_recipes(selector, {'source': self.source})

        # Both picture fields, when present, are passed through ``flatten``.
        for entry in parsed:
            for field in ('photo', 'image'):
                if field in entry:
                    entry[field] = flatten(entry[field])

        return [RecipeItem.from_dict(entry) for entry in parsed]
Пример #4
0
    def parse_item(self, response):
        """Turn a crawled recipe page into ``RecipeItem`` objects.

        :param response: the downloaded page.
        :returns: ``[]`` when the URL ends with ``/review``, else a list
            holding one ``RecipeItem`` per result of ``parse_recipes``.
        """
        # Review pages slip past the link extractor regex because they look
        # like recipe pages; drop them before doing any parsing.
        is_review_page = response.url.endswith('/review')
        if is_review_page:
            return []

        recipes = parse_recipes(
            HtmlXPathSelector(response), {'source': self.source})

        picture_fields = ('photo', 'image')
        for recipe in recipes:
            for key in picture_fields:
                if key not in recipe:
                    continue
                # Replace the stored value with its flattened form.
                recipe[key] = flatten(recipe[key])

        items = []
        for recipe in recipes:
            items.append(RecipeItem.from_dict(recipe))
        return items
Пример #5
0
    def parse_item(self, response):
        """Convert a recipe page into a list of ``RecipeItem`` objects.

        :param response: the downloaded page; its ``url`` is passed on to
            ``parse_recipes`` together with ``self.source``.
        :returns: an empty list for URLs containing ``/reviews/``; otherwise
            one ``RecipeItem`` per entry returned by ``parse_recipes``.
        """
        # The link extractor regex cannot easily tell review pages apart
        # from recipe pages, so they are filtered out here instead.
        if '/reviews/' in response.url:
            return []

        selector = HtmlXPathSelector(response)
        parsed = parse_recipes(
            selector, {'source': self.source, 'url': response.url})

        for entry in parsed:
            for field in ('photo', 'image'):
                if field in entry:
                    # Flatten the value, then swap the '_med.' marker for
                    # '_lg.' (presumably the larger image variant -- confirm).
                    entry[field] = flatten(entry[field]).replace('_med.', '_lg.')

        return [RecipeItem.from_dict(entry) for entry in parsed]
Пример #6
0
    def parse_item(self, response):
        """Turn a crawled recipe page into ``RecipeItem`` objects.

        :param response: the downloaded page; ``response.url`` is forwarded
            to ``parse_recipes`` along with ``self.source``.
        :returns: ``[]`` when the URL contains ``/reviews/``, else a list
            holding one ``RecipeItem`` per result of ``parse_recipes``.
        """
        page_url = response.url
        # Review pages slip past the link extractor regex because they look
        # like recipe pages; drop them before doing any parsing.
        if '/reviews/' in page_url:
            return []

        recipes = parse_recipes(
            HtmlXPathSelector(response),
            {'source': self.source, 'url': page_url})

        picture_fields = ('photo', 'image')
        for recipe in recipes:
            for key in picture_fields:
                if key not in recipe:
                    continue
                flattened = flatten(recipe[key])
                # '_med.' -> '_lg.' in the flattened value (looks like a
                # switch to a larger image file -- TODO confirm).
                recipe[key] = flattened.replace('_med.', '_lg.')

        items = []
        for recipe in recipes:
            items.append(RecipeItem.from_dict(recipe))
        return items
Пример #7
0
    def parse_item(self, response):
        """Convert a crawled page into a list of ``RecipeItem`` objects.

        :param response: the downloaded page; its ``url`` and
            ``self.source`` are handed to ``parse_recipes``.
        :returns: one ``RecipeItem`` per entry returned by ``parse_recipes``.
        """
        selector = HtmlXPathSelector(response)
        page_info = {'source': self.source, 'url': response.url}
        parsed = parse_recipes(selector, page_info)

        return [RecipeItem.from_dict(entry) for entry in parsed]