Python html_to_text示例，cocktails.utils.html_to_text Python示例

示例#1

0

显示文件

文件： allPythonContent.py 项目： Mondego/pyreco

	def parse_archive_recipes(self, response, scraped_recipes):
		"""Yield a CocktailItem for each archive recipe not scraped before.

		Every node matched by the title XPath is treated as a recipe title.
		Titles whose lower-cased form is already in ``scraped_recipes`` are
		skipped. The sibling nodes after a title are split into lines, and the
		first run of two or more consecutive non-blank lines becomes the
		ingredient list; titles without such a run produce no item.
		"""
		selector = HtmlXPathSelector(response)
		title_nodes = selector.select('//u[b][not(parent::div)] | //div[u[b]]')

		for index, title_node in enumerate(title_nodes):
			title = html_to_text(title_node.extract()).strip().strip('.').title()
			if title.lower() in scraped_recipes:
				continue

			# The predicate excludes nodes that already have (index + 2)
			# title markers before them (presumably: stop at the next title).
			following = title_node.select(
				'./following-sibling::node()[not(preceding::u[b][%d])]' % (index + 2)
			)
			chunks = split_at_br(
				following,
				include_blank=True,
				newline_elements=['br', 'div', 'b']
			)

			ingredients = []
			# The trailing '' acts as a sentinel blank line closing the last run.
			for chunk in chunks + ['']:
				text = html_to_text(chunk).strip()
				if text:
					ingredients.append(text)
				elif len(ingredients) > 1:
					# A blank line after two or more lines ends the list.
					break
				else:
					# A lone line followed by a blank is not an ingredient list.
					ingredients = []

			if ingredients:
				yield CocktailItem(
					title=title,
					picture=None,
					url=response.url,
					source="Dr. Adam Elmegirab's",
					ingredients=ingredients
				)

示例#2

0

显示文件

    def parse(self, response):
        """Yield a CocktailItem for every non-empty ``<strong>`` heading.

        The nodes following each heading are split into lines. Ingredient
        lines are collected until a blank line or a line mentioning a
        preparation step (shake, stir, fill glass, preparation) is reached.
        """
        selector = HtmlXPathSelector(response)

        for heading in selector.select("//strong[normalize-space(text()) != '']"):
            siblings = heading.select(
                "ancestor-or-self::node()/following-sibling::node()[not(self::span[starts-with(text(), 'Stir')])]"
            )
            lines = split_at_br(siblings, include_blank=True)

            # Always skip the first line; also skip the second one unless it
            # begins with a digit.
            if lines[1][:1].isdigit():
                first = 1
            else:
                first = 2

            ingredients = []
            for raw in lines[first:]:
                text = html_to_text(raw).strip()
                if not text or re.search(
                        r'\b(?:shaken?|stir(?:red)?|fill glass|preparation)\b',
                        text, re.I):
                    break
                ingredients.append(text)

            name = html_to_text(heading.extract()).strip().rstrip('*').title()
            yield CocktailItem(title=name,
                               picture=None,
                               url=response.url,
                               source="Dale DeGroff's",
                               ingredients=ingredients)

示例#3

0

显示文件

	def parse_archive_recipes(self, response, scraped_recipes):
		"""Yield CocktailItems for archive recipes missing from ``scraped_recipes``.

		Each node matched by the title XPath starts a recipe. The lines after
		it are scanned for the first block of at least two consecutive
		non-blank lines, which is used as the ingredient list. Recipes whose
		lower-cased title is already known, or that have no such block, are
		skipped.
		"""
		page = HtmlXPathSelector(response)
		headings = page.select('//u[b][not(parent::div)] | //div[u[b]]')

		for position, heading in enumerate(headings):
			title = html_to_text(heading.extract()).strip().strip('.').title()
			if title.lower() in scraped_recipes:
				continue

			siblings = heading.select(
				'./following-sibling::node()[not(preceding::u[b][%d])]' % (position + 2)
			)
			raw_lines = split_at_br(
				siblings,
				include_blank=True,
				newline_elements=['br', 'div', 'b']
			)

			ingredients = []
			# A final empty string guarantees the last block gets evaluated.
			for raw in raw_lines + ['']:
				stripped = html_to_text(raw).strip()
				if stripped:
					ingredients.append(stripped)
					continue

				if len(ingredients) >= 2:
					break
				# Zero or one collected line: discard and keep looking.
				ingredients = []

			if not ingredients:
				continue

			yield CocktailItem(
				title=title,
				picture=None,
				url=response.url,
				source="Dr. Adam Elmegirab's",
				ingredients=ingredients
			)

示例#4

0

显示文件

文件： kingcocktail.py 项目： snoack/cocktail-search

    def parse(self, response):
        """Yield one CocktailItem per non-empty ``<strong>`` title on the page.

        Lines following the title are collected as ingredients up to the
        first blank line or the first line that matches a preparation keyword.
        """
        page = HtmlXPathSelector(response)
        stop_words = r'\b(?:shaken?|stir(?:red)?|fill glass|preparation)\b'

        for title in page.select("//strong[normalize-space(text()) != '']"):
            following = title.select("ancestor-or-self::node()/following-sibling::node()[not(self::span[starts-with(text(), 'Stir')])]")
            lines = split_at_br(following, include_blank=True)

            # Ingredients start at index 1 when that line begins with a
            # digit, otherwise at index 2.
            offset = 1 if lines[1][:1].isdigit() else 2

            ingredients = []
            for candidate in lines[offset:]:
                candidate = html_to_text(candidate).strip()

                if not candidate or re.search(stop_words, candidate, re.I):
                    break

                ingredients.append(candidate)

            cocktail_name = html_to_text(title.extract()).strip().rstrip('*').title()
            yield CocktailItem(
                title=cocktail_name,
                picture=None,
                url=response.url,
                source="Dale DeGroff's",
                ingredients=ingredients
            )

示例#5

0

显示文件

文件： wikipedia.py 项目： snoack/cocktail-search

    def parse_recipes(self, response):
        """Yield a CocktailItem for each recipe table on the page.

        If the page declares a canonical URL that differs from
        ``response.url``, a Request for the canonical URL is yielded instead
        and this response is not parsed any further. Recipe nodes without a
        caption or without ingredients are skipped.
        """
        hxs = HtmlXPathSelector(response)

        for url in hxs.select("//link[@rel='canonical']/@href").extract():
            url = urljoin(response.url, url)

            if url != response.url:
                yield Request(url, callback=self.parse_recipes)
                # End the generator with a plain return: raising
                # StopIteration inside a generator is turned into a
                # RuntimeError by PEP 479 (Python 3.7+).
                return

        for recipe in hxs.select(xp_recipes):
            # Take the first caption as the title; skip recipes without one.
            for title in recipe.select('caption').extract():
                break
            else:
                continue

            ingredients = recipe.select(xp_ingredients).extract()
            if not ingredients:
                continue

            # Use the first matching image, if any.
            for picture in recipe.select("tr/td[@colspan='2']//img/@src | preceding-sibling::*[contains(concat(' ', normalize-space(@class), ' '), ' thumb ')]//img/@src").extract():
                picture = urljoin(response.url, picture)
                break
            else:
                picture = None

            yield CocktailItem(
                title=html_to_text(title),
                picture=picture,
                url=response.url,
                source='Wikipedia',
                ingredients=[html_to_text(x) for x in ingredients]
            )

示例#6

0

显示文件

文件： cocktailtimes.py 项目： thomasmodeneis/cocktail-search

    def parse_recipe(self, response):
        """Return a one-element list with the recipe's CocktailItem, or [].

        An empty list is returned when the page has no ingredients or no
        header. Each ingredient text is expected to contain a '-' separator;
        only the part after the first '-' is kept (a text without '-' raises
        IndexError).
        """
        hxs = HtmlXPathSelector(response)

        ingredients = [
            html_to_text(s).split('-', 1)[1].strip()
            for s in hxs.select(xp_ingredients).extract()
        ]
        if not ingredients:
            return []

        for title in hxs.select(xp_header).extract():
            break
        else:
            return []

        for picture in hxs.select(xp_picture).extract():
            picture = urljoin(response.url, picture)
            # Without this break the else clause below always ran and reset
            # the picture to None even when an image had been found.
            break
        else:
            picture = None

        return [
            CocktailItem(
                title=html_to_text(title),
                picture=picture,
                url=response.url,
                source='Cocktail Times',
                ingredients=ingredients,
            )
        ]

示例#7

0

显示文件

文件： allPythonContent.py 项目： Mondego/pyreco

	def parse_recipe(self, response, num_recipes, scraped_recipes):
		"""Yield the recipe found on a page, then possibly the archive request.

		The ingredient list is the longest run of consecutive non-blank lines
		found in any ``<p>`` (later runs win ties). The title is taken from
		the text following ' such as the ' when such a text node exists,
		otherwise from the paragraph preceding the ingredients. The
		lower-cased title is added to ``scraped_recipes``; once that set
		holds ``num_recipes`` entries, a Request for the 'Archives' link is
		yielded.
		"""
		selector = HtmlXPathSelector(response)

		ingredients = []
		for paragraph in selector.select('//p'):
			run = []

			# The trailing '' closes the last run of the paragraph.
			for raw in split_at_br(paragraph, include_blank=True) + ['']:
				text = html_to_text(raw).strip()

				if not text:
					if len(run) >= len(ingredients):
						ingredients = run
						paragraph_with_ingredients = paragraph
					run = []
				else:
					run.append(text)

		marker_nodes = selector.select("//text()[contains(self::text(), ' such as the ')]")
		if not marker_nodes:
			previous = paragraph_with_ingredients.select('./preceding-sibling::p')[-1]
			title = html_to_text(previous.extract()).rstrip(';')
		else:
			sentence = html_to_text(marker_nodes[0].extract())
			title = re.search(r'(?<= such as the ).+?(?=,|;| created )', sentence).group(0)

		yield CocktailItem(
			title=title,
			picture=None,
			url=response.url,
			source="Dr. Adam Elmegirab's",
			ingredients=ingredients
		)

		scraped_recipes.add(title.lower())
		if len(scraped_recipes) == num_recipes:
			archive_href = selector.select("//a[text() = 'Archives']/@href")[0].extract()
			archive_callback = partial(
				self.parse_archive,
				scraped_recipes=scraped_recipes
			)
			yield Request(urljoin(response.url, archive_href), archive_callback)

示例#8

0

显示文件

	def parse_recipe(self, response, num_recipes, scraped_recipes):
		"""Yield a CocktailItem for the page and, when due, the archive request.

		Ingredients are the longest block of consecutive non-blank lines
		inside any paragraph; on equal length the later block is used. The
		title comes from the phrase after ' such as the ' if present, else
		from the last paragraph before the ingredient paragraph. After
		recording the title in ``scraped_recipes``, the 'Archives' page is
		requested when the set size equals ``num_recipes``.
		"""
		page = HtmlXPathSelector(response)

		ingredients = []
		for paragraph in page.select('//p'):
			block = []

			# An extra empty line forces evaluation of the final block.
			for piece in split_at_br(paragraph, include_blank=True) + ['']:
				piece = html_to_text(piece).strip()

				if piece:
					block.append(piece)
				else:
					if len(block) >= len(ingredients):
						ingredients = block
						paragraph_with_ingredients = paragraph
					block = []

		hits = page.select("//text()[contains(self::text(), ' such as the ')]")
		if hits:
			hit_text = html_to_text(hits[0].extract())
			title = re.search(r'(?<= such as the ).+?(?=,|;| created )', hit_text).group(0)
		else:
			title_paragraph = paragraph_with_ingredients.select('./preceding-sibling::p')[-1]
			title = html_to_text(title_paragraph.extract()).rstrip(';')

		yield CocktailItem(
			title=title,
			picture=None,
			url=response.url,
			source="Dr. Adam Elmegirab's",
			ingredients=ingredients
		)

		scraped_recipes.add(title.lower())
		if len(scraped_recipes) != num_recipes:
			return

		archive_url = urljoin(
			response.url,
			page.select("//a[text() = 'Archives']/@href")[0].extract()
		)
		yield Request(
			archive_url,
			partial(self.parse_archive, scraped_recipes=scraped_recipes)
		)

示例#9

0

显示文件

def extract_extra_ingredients(nodes, is_section_header):
    """Split ingredient nodes into main and extra ingredients by section.

    Args:
        nodes: iterable of XPathSelector objects or already-extracted
            strings; each is converted with ``html_to_text`` and stripped.
        is_section_header: callable taking the original node and returning a
            true value when the node starts a new section.

    Returns:
        A ``(ingredients, extra_ingredients)`` tuple. ``ingredients`` is the
        group of lines seen before any section header, or, if there is no
        such group, the lines of the last section. ``extra_ingredients`` is
        the flattened content of all remaining sections.
    """
    section = None
    sections = OrderedDict()

    for node in nodes:
        text = node.extract() if isinstance(node, XPathSelector) else node
        text = html_to_text(text).strip()

        if not text:
            continue

        if is_section_header(node):
            section = text
            continue

        sections.setdefault(section, []).append(text)

    if None in sections:
        ingredients = sections.pop(None)
    elif sections:
        # popitem() removes the most recently inserted section. Indexing
        # sections.keys() only works on Python 2, where keys() is a list.
        ingredients = sections.popitem()[1]
    else:
        ingredients = []

    extra_ingredients = [x for y in sections.values() for x in y]

    return (ingredients, extra_ingredients)

示例#10

0

显示文件

    def parse_recipes(self, response):
        """Yield a CocktailItem for each recipe table on the page.

        When the page's canonical URL differs from ``response.url``, only a
        Request for the canonical URL is yielded. Recipes lacking a caption
        or ingredients are skipped.
        """
        hxs = HtmlXPathSelector(response)

        for url in hxs.select("//link[@rel='canonical']/@href").extract():
            url = urljoin(response.url, url)

            if url != response.url:
                yield Request(url, callback=self.parse_recipes)
                # A bare return finishes the generator. Raising
                # StopIteration here would surface as RuntimeError under
                # PEP 479 (Python 3.7+).
                return

        for recipe in hxs.select(xp_recipes):
            # First caption is the title; no caption means no recipe.
            for title in recipe.select('caption').extract():
                break
            else:
                continue

            ingredients = recipe.select(xp_ingredients).extract()
            if not ingredients:
                continue

            # First matching image wins.
            for picture in recipe.select(
                    "tr/td[@colspan='2']//img/@src | preceding-sibling::*[contains(concat(' ', normalize-space(@class), ' '), ' thumb ')]//img/@src"
            ).extract():
                picture = urljoin(response.url, picture)
                break
            else:
                picture = None

            # Build a real list: on Python 3 map() is a one-shot iterator.
            yield CocktailItem(title=html_to_text(title),
                               picture=picture,
                               url=response.url,
                               source='Wikipedia',
                               ingredients=[html_to_text(x) for x in ingredients])

示例#11

0

显示文件

文件： snippet.py 项目： szabo92/gistable

	def parse_recipe(self, response, title, picture):
		"""Yield a CocktailItem built from the page's ingredient nodes.

		Nodes containing a ``<strong>`` child start a new section. The main
		ingredients are the lines before any section header, or, failing
		that, the lines of the last section; all other sections are
		flattened into ``extra_ingredients``. Nothing is yielded when the
		page has no ingredient text at all.
		"""
		hxs = HtmlXPathSelector(response)

		section = None
		sections = OrderedDict()

		for node in hxs.select(xp_ingredients):
			text = html_to_text(node.extract()).strip()

			if not text:
				continue

			if node.select('strong'):
				section = text
				continue

			sections.setdefault(section, []).append(text)

		if not sections:
			# Previously this case crashed while looking up the last key of
			# an empty mapping.
			return

		if None in sections:
			ingredients = sections.pop(None)
		else:
			# popitem() removes the last inserted section and, unlike
			# indexing keys(), also works with Python 3 dict views.
			ingredients = sections.popitem()[1]

		extra_ingredients = [x for y in sections.values() for x in y]

		yield CocktailItem(
			title=title,
			picture=picture,
			url=response.url,
			ingredients=ingredients,
			extra_ingredients=extra_ingredients
		)

示例#12

0

显示文件

    def parse_recipe(self, response):
        """Return a one-element list with the recipe's CocktailItem, or [].

        An empty list is returned when the page has no ``<h1>``. The picture
        is the first ``itemprop='photo'`` image, resolved against the
        response URL, or None when there is none.
        """
        hxs = HtmlXPathSelector(response)

        for title in hxs.select('//h1').extract():
            break
        else:
            return []

        for picture in hxs.select("//img[@itemprop='photo']/@src").extract():
            picture = urljoin(response.url, picture)
            break
        else:
            picture = None

        ingredients = hxs.select("//*[@itemprop='ingredient']").extract()

        return [
            CocktailItem(
                title=html_to_text(title),
                picture=picture,
                url=response.url,
                source='Liquor.com',
                # A list rather than map(): map() is lazy on Python 3.
                ingredients=[html_to_text(x) for x in ingredients],
            )
        ]

示例#13

0

显示文件

文件： seriouseats.py 项目： arunpn/cocktail-search

	def parse_recipe(self, response, title, picture):
		"""Yield a CocktailItem assembled from the page's ingredient nodes.

		A node with a ``<strong>`` child is a section header. Lines that
		appear before any header form the main ingredients; if there are
		none, the last section is used instead. The remaining sections are
		flattened into ``extra_ingredients``. A page without any ingredient
		text yields nothing.
		"""
		hxs = HtmlXPathSelector(response)

		section = None
		sections = OrderedDict()

		for node in hxs.select(xp_ingredients):
			text = html_to_text(node.extract()).strip()

			if not text:
				continue

			if node.select('strong'):
				section = text
				continue

			sections.setdefault(section, []).append(text)

		if not sections:
			# The original lookup of the last key raised on an empty mapping.
			return

		if None in sections:
			ingredients = sections.pop(None)
		else:
			# popitem() pops the last inserted section; keys()[-1] is not
			# available on Python 3 dict views.
			ingredients = sections.popitem()[1]

		extra_ingredients = [x for y in sections.values() for x in y]

		yield CocktailItem(
			title=title,
			picture=picture,
			url=response.url,
			ingredients=ingredients,
			extra_ingredients=extra_ingredients
		)

示例#14

0

显示文件

文件： allPythonContent.py 项目： Mondego/pyreco

def extract_extra_ingredients(nodes, is_section_header):
	"""Group ingredient lines by section and separate main from extra ones.

	Args:
		nodes: iterable of XPathSelector objects or plain strings; each is
			passed through ``html_to_text`` and stripped.
		is_section_header: callable receiving the original node; a true
			result marks the node's text as the name of a new section.

	Returns:
		``(ingredients, extra_ingredients)``. ``ingredients`` holds the
		lines found before any header, or the last section's lines when no
		such lines exist, or an empty list when nothing was found.
		``extra_ingredients`` is every other section's lines, flattened.
	"""
	section = None
	sections = OrderedDict()

	for node in nodes:
		text = node.extract() if isinstance(node, XPathSelector) else node
		text = html_to_text(text).strip()

		if not text:
			continue

		if is_section_header(node):
			section = text
			continue

		sections.setdefault(section, []).append(text)

	if None in sections:
		ingredients = sections.pop(None)
	elif sections:
		# popitem() returns the last inserted (key, value) pair; indexing
		# keys() breaks on Python 3, where keys() is a view.
		ingredients = sections.popitem()[1]
	else:
		ingredients = []

	extra_ingredients = [x for y in sections.values() for x in y]

	return (ingredients, extra_ingredients)

示例#15

0

显示文件

文件： cocktaildb.py 项目： snoack/cocktail-search

    def parse_recipe(self, response):
        """Return ``[CocktailItem]`` for the page, or ``[]`` without an ``<h2>``.

        The first ``<h2>`` supplies the title; ingredients are the nodes
        matched by ``xp_ingredients``, converted to text.
        """
        selector = HtmlXPathSelector(response)

        headings = selector.select('//h2').extract()
        if not headings:
            return []

        raw_ingredients = selector.select(xp_ingredients).extract()
        ingredient_texts = [html_to_text(raw) for raw in raw_ingredients]

        item = CocktailItem(
            title=html_to_text(headings[0]),
            picture=None,
            url=response.url,
            source='CocktailDB',
            ingredients=ingredient_texts,
        )
        return [item]

示例#16

0

显示文件

文件： drinksmixer.py 项目： snoack/cocktail-search

    def parse_recipe(self, response):
        """Return ``[CocktailItem]`` for the page, or ``[]`` if no title matches.

        The title is the first ``xp_title`` match with a trailing
        ' recipe' suffix removed.
        """
        selector = HtmlXPathSelector(response)

        titles = selector.select(xp_title).extract()
        if not titles:
            return []

        raw_ingredients = selector.select(xp_ingredients).extract()

        name = re.sub(r'\s+recipe$', '', html_to_text(titles[0]))
        item = CocktailItem(
            title=name,
            picture=None,
            url=response.url,
            source='Drinks Mixer',
            ingredients=[html_to_text(raw) for raw in raw_ingredients],
        )
        return [item]

示例#17

0

显示文件

文件： monkey47.py 项目： snoack/cocktail-search

    def parse_recipe(self, response):
        """Return ``[CocktailItem]`` for the post, or ``[]`` if no title matches.

        Ingredient lines ending in ':' are dropped. The title keeps only the
        text after the last ':' and the last en dash.
        """
        selector = HtmlXPathSelector(response)

        titles = selector.select(xp_title).extract()
        if not titles:
            return []

        ingredients = [
            html_to_text(part)
            for part in split_at_br(selector.select(xp_ingredients))
            if not part.endswith(':')
        ]

        after_colon = html_to_text(titles[0]).split(':')[-1]
        name = after_colon.split('\u2013')[-1].strip()

        item = CocktailItem(
            title=name,
            picture=None,
            url=response.url,
            source='Monkey 47 Blog',
            ingredients=ingredients
        )
        return [item]

示例#18

0

显示文件

文件： monkey47.py 项目： thomasmodeneis/cocktail-search

    def parse_recipe(self, response):
        """Return ``[CocktailItem]`` for the post, or ``[]`` without a title.

        Lines from the ingredient nodes that end with ':' are skipped. Only
        the part of the title after the last ':' and last en dash is used.
        """
        page = HtmlXPathSelector(response)

        found_titles = page.select(xp_title).extract()
        if not found_titles:
            return []
        title = found_titles[0]

        ingredients = []
        for piece in split_at_br(page.select(xp_ingredients)):
            if piece.endswith(':'):
                continue
            ingredients.append(html_to_text(piece))

        tail = html_to_text(title).split(':')[-1]
        tail = tail.split(u'\u2013')[-1]

        return [
            CocktailItem(title=tail.strip(),
                         picture=None,
                         url=response.url,
                         source='Monkey 47 Blog',
                         ingredients=ingredients)
        ]

示例#19

0

显示文件

文件： ohgosh.py 项目： snoack/cocktail-search

    def parse_recipes(self, response, recipe_urls):
        """Yield a CocktailItem for each URL in ``recipe_urls``.

        Each URL's fragment is the ``id`` of the recipe's heading node on
        this page. The picture is taken from the node directly before it and
        the ingredients from ``<li>`` items in the two nodes after it.
        """
        page = HtmlXPathSelector(response)

        for url in recipe_urls:
            anchor = urlparse(url).fragment
            heading = page.select("//*[@id='%s']" % anchor)[0]

            sources = heading.select('./preceding-sibling::*[1]/img/@src').extract()
            picture = urljoin(url, sources[0]) if sources else None

            items = heading.select('./following-sibling::*[position()<=2]/li').extract()

            yield CocktailItem(
                title=html_to_text(heading.extract()),
                picture=picture,
                url=url,
                source='Oh Gosh!',
                ingredients=[html_to_text(item) for item in items],
            )

示例#20

0

显示文件

文件： saveur.py 项目： WillJHaggard/cocktail-search

	def parse_recipe(self, response):
		"""Return ``[CocktailItem]`` for the page, or ``[]`` if it has no recipe.

		An empty list is returned when there is no ``<h1>`` or no
		ingredients. Ingredient candidates are tried in order: nodes between
		an 'Ingredients' and an 'Instructions'/'Directions' ``<h4>``, then
		several bold-text fallbacks. All-uppercase lines are treated as
		section headers by ``extract_extra_ingredients``.
		"""
		hxs = HtmlXPathSelector(response)

		for title in hxs.select('//h1').extract():
			break
		else:
			return []

		for picture in hxs.select("//img[@itemprop='photo']/@src").extract():
			picture = urljoin(response.url, picture)
			break
		else:
			picture = None

		# The adjacent literals are concatenated into one XPath expression,
		# so every "or" needs a trailing space. One was missing, which
		# produced "orstarts-with(" and relied on a lenient parser.
		ingredients, extra_ingredients = extract_extra_ingredients(
			(
				split_at_br(hxs.select(
					"//node()"
						"[preceding::h4["
							"starts-with(text(),'INGREDIENTS') or "
							"starts-with(text(),'Ingredients') or "
							"starts-with(text(),'ingredients')"
						"]]"
						"[following::h4["
							"starts-with(text(),'INSTRUCTIONS') or "
							"starts-with(text(),'Instructions') or "
							"starts-with(text(),'instructions') or "
							"starts-with(text(),'DIRECTIONS') or "
							"starts-with(text(),'Directions') or "
							"starts-with(text(),'directions')"
						"]]"
				)) or
				hxs.select('//div[count(*)=1]/b').extract() or
				split_at_br(hxs.select('//b//node()')) or
				hxs.select("//span[@style='font-weight: bold;']").extract()
			),
			lambda s: s.isupper()
		)

		if not ingredients:
			return []

		return [CocktailItem(
			title=html_to_text(title).strip(),
			picture=picture,
			url=response.url,
			source='Saveur',
			ingredients=ingredients,
			extra_ingredients=extra_ingredients
		)]

示例#21

0

显示文件

文件： drinkboy.py 项目： snoack/cocktail-search

    def parse_recipe(self, response):
        """Return ``[CocktailItem]`` for the page, or ``[]`` without a name.

        The title is the first ``itemprop='name'`` element and the picture
        the first ``itemprop='image'`` source, or None if there is none.
        """
        selector = HtmlXPathSelector(response)

        names = selector.select("//*[@itemprop='name']").extract()
        if not names:
            return []

        images = selector.select("//img[@itemprop='image']/@src").extract()
        if images:
            picture = urljoin(response.url, images[0])
        else:
            picture = None

        raw_ingredients = selector.select(xp_ingredients).extract()

        item = CocktailItem(
            title=html_to_text(names[0]),
            picture=picture,
            url=response.url,
            source='DrinkBoy',
            ingredients=[html_to_text(raw) for raw in raw_ingredients],
        )
        return [item]

示例#22

0

显示文件

文件： cocktailtimes.py 项目： snoack/cocktail-search

    def parse_recipe(self, response):
        """Return ``[CocktailItem]`` for the page, or ``[]`` if data is missing.

        An empty list is returned when no ingredients or no header are
        found. Only the part after the first '-' of each ingredient text is
        kept; a text without '-' raises IndexError.
        """
        hxs = HtmlXPathSelector(response)

        ingredients = [html_to_text(s).split('-', 1)[1].strip() for s in hxs.select(xp_ingredients).extract()]
        if not ingredients:
            return []

        for title in hxs.select(xp_header).extract():
            break
        else:
            return []

        for picture in hxs.select(xp_picture).extract():
            picture = urljoin(response.url, picture)
            # The missing break made the else clause run on every call, so
            # the picture was always discarded.
            break
        else:
            picture = None

        return [CocktailItem(
            title=html_to_text(title),
            picture=picture,
            url=response.url,
            source='Cocktail Times',
            ingredients=ingredients,
        )]

示例#23

0

显示文件

    def parse_recipe(self, response):
        """Return ``[CocktailItem]`` for the page, or ``[]`` if it is unusable.

        The title comes from the ``og:title`` meta tag. Each ``<li>`` of the
        ingredients list is rebuilt from its child nodes; for children whose
        class contains 'ingredient', only the text after the last '--' is
        kept. Pages with at most one ingredient are ignored.
        """
        selector = HtmlXPathSelector(response)

        titles = selector.select(
            "//meta[@property='og:title']/@content").extract()
        if not titles:
            return []

        images = selector.select(
            "//*[@id='drink_infopicvid']/img/@src").extract()
        picture = urljoin(response.url, images[0]) if images else None

        ingredients = []
        for entry in selector.select("//ul[@id='ingredients']/li"):
            pieces = []

            for child in entry.select('* | text()'):
                piece = html_to_text(child.extract())

                css_classes = (child.xmlNode.prop('class') or '').split()
                if 'ingredient' in css_classes:
                    piece = piece.split('--')[-1]

                piece = piece.strip()
                if piece:
                    pieces.append(piece)

            ingredients.append(' '.join(pieces))

        # don't crawl recipes like 'American Whiskey & Canadian Whisky',
        # that only consist of pouring a single spirit into a glass.
        if len(ingredients) <= 1:
            return []

        item = CocktailItem(title=unescape(titles[0]),
                            picture=picture,
                            url=response.url,
                            source='Esquire',
                            ingredients=ingredients)
        return [item]

示例#24

0

显示文件

    def parse_recipe(self, response):
        """Return ``[CocktailItem]`` for the page, or ``[]`` if it has no recipe.

        An empty list is returned without an ``<h1>`` or without
        ingredients. Ingredient candidates are tried in order: nodes between
        an 'Ingredients' and an 'Instructions'/'Directions' ``<h4>``, then
        several bold-text fallbacks. All-uppercase lines are passed to
        ``extract_extra_ingredients`` as section headers.
        """
        hxs = HtmlXPathSelector(response)

        for title in hxs.select('//h1').extract():
            break
        else:
            return []

        for picture in hxs.select("//img[@itemprop='photo']/@src").extract():
            picture = urljoin(response.url, picture)
            break
        else:
            picture = None

        # The adjacent literals form a single XPath expression; each "or"
        # must end with a space. One literal lacked it, yielding
        # "orstarts-with(" in the final expression.
        ingredients, extra_ingredients = extract_extra_ingredients(
            (split_at_br(
                hxs.select("//node()"
                           "[preceding::h4["
                           "starts-with(text(),'INGREDIENTS') or "
                           "starts-with(text(),'Ingredients') or "
                           "starts-with(text(),'ingredients')"
                           "]]"
                           "[following::h4["
                           "starts-with(text(),'INSTRUCTIONS') or "
                           "starts-with(text(),'Instructions') or "
                           "starts-with(text(),'instructions') or "
                           "starts-with(text(),'DIRECTIONS') or "
                           "starts-with(text(),'Directions') or "
                           "starts-with(text(),'directions')"
                           "]]"))
             or hxs.select('//div[count(*)=1]/b').extract()
             or split_at_br(hxs.select('//b//node()'))
             or hxs.select("//span[@style='font-weight: bold;']").extract()),
            lambda s: s.isupper())

        if not ingredients:
            return []

        return [
            CocktailItem(title=html_to_text(title).strip(),
                         picture=picture,
                         url=response.url,
                         source='Saveur',
                         ingredients=ingredients,
                         extra_ingredients=extra_ingredients)
        ]

示例#25

0

显示文件

文件： esquire.py 项目： snoack/cocktail-search

    def parse_recipe(self, response):
        """Return ``[CocktailItem]`` for the page, or ``[]`` if it is unusable.

        Uses the ``og:title`` meta content as title. Every ingredient
        ``<li>`` is flattened into one string from its child nodes, keeping
        only the text after the last '--' for children carrying the
        'ingredient' class. Recipes with fewer than two ingredients are
        dropped.
        """
        page = HtmlXPathSelector(response)

        og_titles = page.select("//meta[@property='og:title']/@content").extract()
        if not og_titles:
            return []
        title = og_titles[0]

        picture = None
        picture_sources = page.select("//*[@id='drink_infopicvid']/img/@src").extract()
        if picture_sources:
            picture = urljoin(response.url, picture_sources[0])

        ingredients = []
        for item_node in page.select("//ul[@id='ingredients']/li"):
            words = []

            for child in item_node.select('* | text()'):
                fragment = html_to_text(child.extract())

                if 'ingredient' in (child.xmlNode.prop('class') or '').split():
                    fragment = fragment.split('--')[-1]

                fragment = fragment.strip()
                if fragment:
                    words.append(fragment)

            ingredients.append(' '.join(words))

        # don't crawl recipes like 'American Whiskey & Canadian Whisky',
        # that only consist of pouring a single spirit into a glass.
        if len(ingredients) <= 1:
            return []

        return [CocktailItem(
            title=unescape(title),
            picture=picture,
            url=response.url,
            source='Esquire',
            ingredients=ingredients
        )]

示例#26

0

显示文件

    def parse_recipe(self, response):
        """Return ``[CocktailItem]`` for the page, or ``[]`` if no title matches.

        The title is the first ``xp_title`` match with a trailing
        ' recipe' suffix removed.
        """
        hxs = HtmlXPathSelector(response)

        for title in hxs.select(xp_title).extract():
            break
        else:
            return []

        ingredients = hxs.select(xp_ingredients).extract()

        return [
            CocktailItem(
                title=re.sub(r'\s+recipe$', '', html_to_text(title)),
                picture=None,
                url=response.url,
                source='Drinks Mixer',
                # A list rather than map(): map() is lazy on Python 3.
                ingredients=[html_to_text(x) for x in ingredients],
            )
        ]

示例#27

0

显示文件

文件： cocktaildb.py 项目： thomasmodeneis/cocktail-search

    def parse_recipe(self, response):
        """Return ``[CocktailItem]`` for the page, or ``[]`` without an ``<h2>``.

        The first ``<h2>`` is the title; ingredients are the ``xp_ingredients``
        matches converted to text.
        """
        hxs = HtmlXPathSelector(response)

        for title in hxs.select('//h2').extract():
            break
        else:
            return []

        ingredients = hxs.select(xp_ingredients).extract()

        return [
            CocktailItem(
                title=html_to_text(title),
                picture=None,
                url=response.url,
                source='CocktailDB',
                # Materialise the list; map() returns an iterator on Python 3.
                ingredients=[html_to_text(x) for x in ingredients],
            )
        ]

示例#28

0

显示文件

文件： liquor.py 项目： arunpn/cocktail-search

	def parse_recipe(self, response):
		"""Return ``[CocktailItem]`` for the page, or ``[]`` without an ``<h1>``.

		The picture is the first ``itemprop='photo'`` image resolved against
		the response URL, or None when the page has none.
		"""
		hxs = HtmlXPathSelector(response)

		for title in hxs.select('//h1').extract():
			break
		else:
			return []

		for picture in hxs.select("//img[@itemprop='photo']/@src").extract():
			picture = urljoin(response.url, picture)
			break
		else:
			picture = None

		ingredients = hxs.select("//*[@itemprop='ingredient']").extract()

		return [CocktailItem(
			title=html_to_text(title),
			picture=picture,
			url=response.url,
			# A list rather than map(): map() is lazy on Python 3.
			ingredients=[html_to_text(x) for x in ingredients],
		)]

示例#29

0

显示文件

    def parse_recipes(self, response, recipe_urls):
        """Yield a CocktailItem for each URL in ``recipe_urls``.

        The fragment of each URL is looked up as an element ``id`` on this
        page; a missing element raises IndexError. The picture comes from
        the node directly before that element and the ingredients from
        ``<li>`` items in the two nodes after it.
        """
        hxs = HtmlXPathSelector(response)

        for url in recipe_urls:
            node = hxs.select("//*[@id='%s']" % urlparse(url).fragment)[0]

            for picture in node.select(
                    './preceding-sibling::*[1]/img/@src').extract():
                picture = urljoin(url, picture)
                break
            else:
                picture = None

            ingredients = node.select(
                './following-sibling::*[position()<=2]/li').extract()

            yield CocktailItem(
                title=html_to_text(node.extract()),
                picture=picture,
                url=url,
                source='Oh Gosh!',
                # Materialise the list; map() returns an iterator on Python 3.
                ingredients=[html_to_text(x) for x in ingredients],
            )