Пример #1
0
	def parse_recipe(self, response):
		"""Build the CocktailItem for a single recipe page.

		Returns an empty list when the page has no ``<h1>`` or when no
		ingredients come back from ``extract_extra_ingredients``; otherwise
		returns a one-element list holding the item (source ``'Saveur'``).
		"""
		hxs = HtmlXPathSelector(response)

		# Only the first <h1> is used as the title.
		titles = hxs.select('//h1').extract()
		if not titles:
			return []
		title = titles[0]

		# The first photo, if any, is resolved against the page URL.
		pictures = hxs.select("//img[@itemprop='photo']/@src").extract()
		picture = urljoin(response.url, pictures[0]) if pictures else None

		# Nodes that come after an "Ingredients" <h4> and before an
		# "Instructions"/"Directions" <h4>. Every fragment except the last one
		# of each predicate ends in "or " -- the trailing space is required,
		# otherwise the operator fuses with the next function name and the
		# expression is no longer valid XPath 1.0.
		between_headings_xpath = (
			"//node()"
			"[preceding::h4["
			"starts-with(text(),'INGREDIENTS') or "
			"starts-with(text(),'Ingredients') or "
			"starts-with(text(),'ingredients')"
			"]]"
			"[following::h4["
			"starts-with(text(),'INSTRUCTIONS') or "
			"starts-with(text(),'Instructions') or "
			"starts-with(text(),'instructions') or "
			"starts-with(text(),'DIRECTIONS') or "
			"starts-with(text(),'Directions') or "
			"starts-with(text(),'directions')"
			"]]"
		)

		# Strategies are tried in order; the first non-empty result is used.
		candidates = (
			split_at_br(hxs.select(between_headings_xpath)) or
			hxs.select('//div[count(*)=1]/b').extract() or
			split_at_br(hxs.select('//b//node()')) or
			hxs.select("//span[@style='font-weight: bold;']").extract()
		)

		ingredients, extra_ingredients = extract_extra_ingredients(
			candidates,
			lambda s: s.isupper()
		)

		if not ingredients:
			return []

		return [CocktailItem(
			title=html_to_text(title).strip(),
			picture=picture,
			url=response.url,
			source='Saveur',
			ingredients=ingredients,
			extra_ingredients=extra_ingredients
		)]
Пример #2
0
    def parse(self, response):
        """Yield one CocktailItem for each non-empty ``<strong>`` on the page.

        The lines that follow a heading are collected as ingredients until a
        blank line or a line containing a preparation keyword is reached.
        """
        hxs = HtmlXPathSelector(response)
        following_xpath = "ancestor-or-self::node()/following-sibling::node()[not(self::span[starts-with(text(), 'Stir')])]"
        stop_pattern = r'\b(?:shaken?|stir(?:red)?|fill glass|preparation)\b'

        for heading in hxs.select("//strong[normalize-space(text()) != '']"):
            lines = split_at_br(heading.select(following_xpath), include_blank=True)

            # Line 0 is always skipped; line 1 is skipped as well unless it
            # begins with a digit.
            first = 1 if lines[1][:1].isdigit() else 2

            ingredients = []
            for raw in lines[first:]:
                text = html_to_text(raw).strip()
                if not text or re.search(stop_pattern, text, re.I):
                    break
                ingredients.append(text)

            name = html_to_text(heading.extract()).strip().rstrip('*').title()
            yield CocktailItem(
                title=name,
                picture=None,
                url=response.url,
                source="Dale DeGroff's",
                ingredients=ingredients
            )
Пример #3
0
	def parse_archive_recipes(self, response, scraped_recipes):
		"""Yield a CocktailItem for each archive recipe not scraped before.

		A title whose lower-cased form is already in ``scraped_recipes`` is
		skipped, as is any title for which no ingredient lines are collected.
		"""
		hxs = HtmlXPathSelector(response)
		title_nodes = hxs.select('//u[b][not(parent::div)] | //div[u[b]]')

		for index, title_node in enumerate(title_nodes):
			raw_title = html_to_text(title_node.extract())
			title = raw_title.strip().strip('.').title()
			if title.lower() in scraped_recipes:
				continue

			# Keep only following siblings that have fewer than (index + 2)
			# u[b] elements before them in the document.
			sibling_xpath = './following-sibling::node()[not(preceding::u[b][%d])]' % (index + 2)
			lines = split_at_br(
				title_node.select(sibling_xpath),
				include_blank=True,
				newline_elements=['br', 'div', 'b']
			)

			ingredients = []
			# The appended '' acts as a closing blank line for the last group.
			for raw_line in lines + ['']:
				text = html_to_text(raw_line).strip()
				if text:
					ingredients.append(text)
					continue

				# Blank line: two or more collected lines form the finished
				# list; a lone line is dropped and collection starts over.
				if len(ingredients) > 1:
					break
				ingredients = []

			if ingredients:
				yield CocktailItem(
					title=title,
					picture=None,
					url=response.url,
					source="Dr. Adam Elmegirab's",
					ingredients=ingredients
				)
Пример #4
0
    def parse(self, response):
        """Emit a CocktailItem for every ``<strong>`` element with text.

        Ingredient lines are read from the nodes following the heading and
        stop at the first blank line or preparation instruction.
        """
        hxs = HtmlXPathSelector(response)
        headings = hxs.select("//strong[normalize-space(text()) != '']")

        for heading in headings:
            siblings = heading.select(
                "ancestor-or-self::node()/following-sibling::node()[not(self::span[starts-with(text(), 'Stir')])]"
            )
            lines = split_at_br(siblings, include_blank=True)

            # Start at index 1 when that line opens with a digit, else at 2.
            start = 2
            if lines[1][:1].isdigit():
                start = 1

            ingredients = []
            for entry in lines[start:]:
                entry = html_to_text(entry).strip()

                # A blank line ends the ingredient list.
                if not entry:
                    break

                # So does a line that mentions a preparation step.
                is_instruction = re.search(
                    r'\b(?:shaken?|stir(?:red)?|fill glass|preparation)\b',
                    entry, re.I)
                if is_instruction:
                    break

                ingredients.append(entry)

            heading_text = html_to_text(heading.extract())
            yield CocktailItem(title=heading_text.strip().rstrip('*').title(),
                               picture=None,
                               url=response.url,
                               source="Dale DeGroff's",
                               ingredients=ingredients)
Пример #5
0
	def parse_archive_recipes(self, response, scraped_recipes):
		"""Generate CocktailItems for archive entries missing from ``scraped_recipes``.

		Membership is checked with the lower-cased title. An entry is only
		yielded when ingredient lines were collected for it.
		"""
		hxs = HtmlXPathSelector(response)

		for position, heading in enumerate(hxs.select('//u[b][not(parent::div)] | //div[u[b]]')):
			title = html_to_text(heading.extract())
			title = title.strip().strip('.').title()
			if title.lower() in scraped_recipes:
				continue

			following = heading.select(
				'./following-sibling::node()[not(preceding::u[b][%d])]' % (position + 2)
			)
			chunks = split_at_br(following, include_blank=True, newline_elements=['br', 'div', 'b'])

			collected = []
			# A trailing '' is added so the loop always ends on a blank line.
			for chunk in chunks + ['']:
				line = html_to_text(chunk).strip()

				if not line:
					# At least two lines before a blank: the list is complete.
					if len(collected) >= 2:
						break
					# Zero or one line: discard and keep scanning.
					collected = []
					continue

				collected.append(line)

			if not collected:
				continue

			yield CocktailItem(
				title=title,
				picture=None,
				url=response.url,
				source="Dr. Adam Elmegirab's",
				ingredients=collected
			)
Пример #6
0
    def parse_recipe(self, response):
        """Build the CocktailItem for a single recipe page.

        Returns an empty list when the page has no ``<h1>`` or when no
        ingredients come back from ``extract_extra_ingredients``; otherwise
        returns a one-element list holding the item (source ``'Saveur'``).
        """
        hxs = HtmlXPathSelector(response)

        # Take the first <h1> as the title; bail out if there is none.
        for title in hxs.select('//h1').extract():
            break
        else:
            return []

        # Take the first photo URL, made absolute; None when there is no photo.
        for picture in hxs.select("//img[@itemprop='photo']/@src").extract():
            picture = urljoin(response.url, picture)
            break
        else:
            picture = None

        # Nodes located after an "Ingredients" <h4> and before an
        # "Instructions"/"Directions" <h4>. Each "or" must be followed by a
        # space inside its fragment: without it the operator runs into the
        # next function name and the expression is not valid XPath 1.0.
        section_xpath = ("//node()"
                         "[preceding::h4["
                         "starts-with(text(),'INGREDIENTS') or "
                         "starts-with(text(),'Ingredients') or "
                         "starts-with(text(),'ingredients')"
                         "]]"
                         "[following::h4["
                         "starts-with(text(),'INSTRUCTIONS') or "
                         "starts-with(text(),'Instructions') or "
                         "starts-with(text(),'instructions') or "
                         "starts-with(text(),'DIRECTIONS') or "
                         "starts-with(text(),'Directions') or "
                         "starts-with(text(),'directions')"
                         "]]")

        # Extraction strategies in priority order; the first non-empty wins.
        candidates = (
            split_at_br(hxs.select(section_xpath))
            or hxs.select('//div[count(*)=1]/b').extract()
            or split_at_br(hxs.select('//b//node()'))
            or hxs.select("//span[@style='font-weight: bold;']").extract())

        ingredients, extra_ingredients = extract_extra_ingredients(
            candidates, lambda s: s.isupper())

        if not ingredients:
            return []

        return [
            CocktailItem(title=html_to_text(title).strip(),
                         picture=picture,
                         url=response.url,
                         source='Saveur',
                         ingredients=ingredients,
                         extra_ingredients=extra_ingredients)
        ]
Пример #7
0
	def parse_recipe(self, response, num_recipes, scraped_recipes):
		"""Yield the recipe on this page and, when due, the archive request.

		The ingredient list is the longest run of consecutive non-blank lines
		found inside a single ``<p>``. The lower-cased title is added to
		``scraped_recipes``; when that collection's size equals
		``num_recipes``, a Request for the 'Archives' link is yielded with
		``self.parse_archive`` as its callback.
		"""
		hxs = HtmlXPathSelector(response)

		ingredients = []
		for para in hxs.select('//p'):
			run = []

			# The appended '' makes sure the last run of a paragraph is
			# compared too.
			for raw in split_at_br(para, include_blank=True) + ['']:
				text = html_to_text(raw).strip()

				if not text:
					# ">=" lets a later run of equal length replace an
					# earlier one.
					if len(run) >= len(ingredients):
						ingredients = run
						best_para = para
					run = []
				else:
					run.append(text)

		marker_nodes = hxs.select("//text()[contains(self::text(), ' such as the ')]")
		if marker_nodes:
			# Title is the text between ' such as the ' and the next
			# ',', ';' or ' created '.
			sentence = html_to_text(marker_nodes[0].extract())
			title = re.search(r'(?<= such as the ).+?(?=,|;| created )', sentence).group(0)
		else:
			# Otherwise use the last <p> sibling before the ingredient
			# paragraph.
			previous = best_para.select('./preceding-sibling::p')[-1]
			title = html_to_text(previous.extract()).rstrip(';')

		yield CocktailItem(
			title=title,
			picture=None,
			url=response.url,
			source="Dr. Adam Elmegirab's",
			ingredients=ingredients
		)

		scraped_recipes.add(title.lower())
		if len(scraped_recipes) == num_recipes:
			archive_href = hxs.select("//a[text() = 'Archives']/@href")[0].extract()
			archive_url = urljoin(response.url, archive_href)
			callback = partial(self.parse_archive, scraped_recipes=scraped_recipes)
			yield Request(archive_url, callback)
Пример #8
0
	def parse_recipe(self, response, num_recipes, scraped_recipes):
		"""Scrape one recipe page; follow up with the archive when all are in.

		Yields a CocktailItem whose ingredients are the longest block of
		consecutive non-blank lines within one ``<p>``. Afterwards the
		lower-cased title is recorded in ``scraped_recipes`` and, if its size
		then equals ``num_recipes``, a Request to the 'Archives' link is
		yielded (callback: ``self.parse_archive``).
		"""
		hxs = HtmlXPathSelector(response)

		longest = []
		for paragraph in hxs.select('//p'):
			current = []

			# Extra '' at the end flushes the paragraph's final block.
			pieces = split_at_br(paragraph, include_blank=True) + ['']
			for piece in pieces:
				piece = html_to_text(piece).strip()

				if piece:
					current.append(piece)
					continue

				# Blank line: keep this block if it is at least as long as
				# the best one so far, then start a new block.
				if len(current) >= len(longest):
					longest = current
					source_paragraph = paragraph

				current = []

		hits = hxs.select("//text()[contains(self::text(), ' such as the ')]")
		if not hits:
			# No ' such as the ' text: take the title from the <p> sibling
			# immediately before the ingredient paragraph.
			title_node = source_paragraph.select('./preceding-sibling::p')[-1]
			title = html_to_text(title_node.extract()).rstrip(';')
		else:
			phrase = html_to_text(hits[0].extract())
			match = re.search(r'(?<= such as the ).+?(?=,|;| created )', phrase)
			title = match.group(0)

		yield CocktailItem(
			title=title,
			picture=None,
			url=response.url,
			source="Dr. Adam Elmegirab's",
			ingredients=longest
		)

		scraped_recipes.add(title.lower())
		if len(scraped_recipes) != num_recipes:
			return

		yield Request(
			urljoin(
				response.url,
				hxs.select("//a[text() = 'Archives']/@href")[0].extract()
			),
			partial(
				self.parse_archive,
				scraped_recipes=scraped_recipes
			)
		)
Пример #9
0
    def parse_recipe(self, response):
        """Build the CocktailItem for a single blog post.

        Returns an empty list when ``xp_title`` matches nothing; otherwise
        returns a one-element list holding the item (source
        ``'Monkey 47 Blog'``).
        """
        hxs = HtmlXPathSelector(response)

        # Use the first title match; without one there is nothing to emit.
        for title in hxs.select(xp_title).extract():
            break
        else:
            return []

        # Lines ending in ':' are left out of the ingredient list.
        ingredients = []
        for ingredient in split_at_br(hxs.select(xp_ingredients)):
            if not ingredient.endswith(':'):
                ingredients.append(html_to_text(ingredient))

        # Keep only the text after the last ':' and after the last en dash.
        # The ``u`` prefix matters on Python 2, where a plain '\u2013' is the
        # six literal characters and would never match a real en dash.
        name = html_to_text(title).split(':')[-1].split(u'\u2013')[-1].strip()

        return [CocktailItem(
            title=name,
            picture=None,
            url=response.url,
            source='Monkey 47 Blog',
            ingredients=ingredients
        )]
Пример #10
0
    def parse_recipe(self, response):
        """Turn a blog post response into a list with at most one CocktailItem.

        An empty list is returned when ``xp_title`` selects nothing.
        """
        hxs = HtmlXPathSelector(response)

        titles = hxs.select(xp_title).extract()
        if not titles:
            return []
        title = titles[0]

        # Entries that end with ':' are excluded from the ingredients.
        ingredients = [
            html_to_text(entry)
            for entry in split_at_br(hxs.select(xp_ingredients))
            if not entry.endswith(':')
        ]

        # The title is whatever follows the last ':' and the last en dash.
        after_colon = html_to_text(title).split(':')[-1]
        name = after_colon.split(u'\u2013')[-1].strip()

        item = CocktailItem(title=name,
                            picture=None,
                            url=response.url,
                            source='Monkey 47 Blog',
                            ingredients=ingredients)
        return [item]