Python parse примеры использования

Язык программирования: Python

Пространство имен/Пакет: page_parser

Метод/Функция: parse

Примеров на hotexamples.com: 7

Python parse — найдено 7 примеров. Это лучшие примеры Python-кода для page_parser.parse, полученные из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество примеров.

Пример #1

Показать файл

Файл: decruft.py Проект: olethanh/decruft

    def get_article(self, candidates, best_candidate):
        """Collect the best candidate and its related siblings into a new container.

        Every child of the best candidate's parent is inspected and appended to
        a fresh ``<div/>`` when it is the best candidate itself, is a scored
        candidate whose ``'content_score'`` reaches the sibling threshold, or is
        a ``<p>`` whose leading text looks like prose (long with few links, or
        short, link-free and ending a sentence).

        Args:
            candidates: Mapping keyed by element; each value is a dict with at
                least a ``'content_score'`` entry.
            best_candidate: Dict with ``'elem'`` (the top-scoring element) and
                ``'content_score'``.

        Returns:
            The object returned by ``parse("<div/>")`` with the selected
            siblings appended in their original order.
        """
        # Now that we have the top candidate, look through its siblings for content that might also be related.
        # Things like preambles, content split by ads that we removed, etc.
        best_elem = best_candidate['elem']
        sibling_score_threshold = max(10, best_candidate['content_score'] * 0.2)
        output = parse("<div/>")

        # A root element has no parent; in that case the candidate stands alone
        # instead of raising AttributeError on None.getchildren().
        parent = best_elem.getparent()
        siblings = parent.getchildren() if parent is not None else [best_elem]

        for sibling in siblings:
            # The best candidate itself is always kept.
            append = sibling is best_elem

            if sibling in candidates and candidates[sibling]['content_score'] >= sibling_score_threshold:
                append = True

            if sibling.tag == "p":
                link_density = self.get_link_density(sibling)
                # Only the text before the first child element is measured.
                node_content = sibling.text or ""
                node_length = len(node_content)

                if node_length > 80 and link_density < 0.25:
                    append = True
                elif node_length < 80 and link_density == 0 and re.search(r'\.( |$)', node_content):
                    append = True

            if append:
                output.append(sibling)

        # The loop above has already placed best_elem in output. Appending it
        # again (as the old unconditional `output is not None` branch did) would,
        # with lxml's move-on-append semantics, push it behind its following
        # siblings and scramble the article order.
        return output

Пример #2

Показать файл

Файл: decruft.py Проект: vuquochuy/decruft

    def get_article(self, candidates, best_candidate):
        """Build the article container from the top candidate plus related siblings.

        The children of the top candidate's parent are scanned in order. A
        sibling is appended to a new ``<div/>`` if it is the top candidate, if
        it is present in ``candidates`` with a ``'content_score'`` at or above
        the sibling threshold, or if it is a ``<p>`` whose leading text passes
        the length / link-density heuristics below.

        Args:
            candidates: Mapping keyed by element; each value is a dict with at
                least a ``'content_score'`` entry.
            best_candidate: Dict with ``'elem'`` (the top-scoring element) and
                ``'content_score'``.

        Returns:
            The object returned by ``parse("<div/>")`` with the selected
            siblings appended in their original order.
        """
        # Now that we have the top candidate, look through its siblings for content that might also be related.
        # Things like preambles, content split by ads that we removed, etc.
        top_elem = best_candidate['elem']
        sibling_score_threshold = max(10, best_candidate['content_score'] * 0.2)
        output = parse("<div/>")

        # getparent() yields None for a root element; treat the candidate as
        # its own only sibling rather than failing on None.getchildren().
        parent = top_elem.getparent()
        if parent is None:
            siblings = [top_elem]
        else:
            siblings = parent.getchildren()

        for sibling in siblings:
            # The top candidate itself is always kept.
            append = sibling is top_elem

            if sibling in candidates and candidates[sibling]['content_score'] >= sibling_score_threshold:
                append = True

            if sibling.tag == "p":
                link_density = self.get_link_density(sibling)
                # Only the text before the first child element is measured.
                node_content = sibling.text or ""
                node_length = len(node_content)

                if node_length > 80 and link_density < 0.25:
                    append = True
                elif node_length < 80 and link_density == 0 and re.search(r'\.( |$)', node_content):
                    append = True

            if append:
                output.append(sibling)

        # top_elem is already inside output at this point. The former
        # `if output is not None: output.append(...)` was always true and, with
        # lxml's move-on-append semantics, relocated the main content to the
        # end, after its following siblings.
        return output

Пример #3

Показать файл

Файл: readability.py Проект: samcal/HN2Kindle

	def _html(self, force=False):
		"""Return the parsed document, parsing ``self.input`` on demand.

		The result of ``parse`` is cached in ``self.html`` and reused on later
		calls unless ``force`` is true.

		Args:
			force: When true, re-parse even if a cached document exists.

		Returns:
			The cached/parsed document, or ``None`` if parsing raised an error.
		"""
		if force or self.html is None:
			try:
				self.html = parse(self.input, self.options['url'], notify=self.notify)
			except Exception:
				# Best-effort: a parse failure is reported as None. Only
				# Exception is caught so KeyboardInterrupt / SystemExit still
				# propagate instead of being silently swallowed.
				return None
		return self.html

Пример #4

Показать файл

Файл: readability.py Проект: Nsl42/rssint

    def get_article(self, candidates, best_candidate):
        """Collect the best candidate and its related siblings into a new container.

        Each non-text child of the best candidate's parent is appended to a
        fresh ``<div/>`` when it is the best candidate itself, is a scored
        candidate whose ``"content_score"`` reaches the sibling threshold, or is
        a ``<p>`` whose string looks like prose (long with few links, or short,
        link-free and ending a sentence).

        Args:
            candidates: Mapping keyed by ``HashableElement``; each value is a
                dict with at least a ``"content_score"`` entry.
            best_candidate: Dict with ``"elem"`` (the top-scoring element) and
                ``"content_score"``.

        Returns:
            The object returned by ``parse("<div/>")`` with the selected
            siblings appended.
        """
        # Now that we have the top candidate, look through its siblings for content that might also be related.
        # Things like preambles, content split by ads that we removed, etc.
        best_elem = best_candidate["elem"]
        sibling_score_threshold = max(10, best_candidate["content_score"] * 0.2)
        output = parse("<div/>")

        # Iterate over a snapshot of the children. If output.append() detaches
        # the sibling from its current parent (as BeautifulSoup's Tag.append
        # does), mutating the live `contents` list mid-iteration would skip the
        # element following every appended one.
        for sibling in list(best_elem.parent.contents):
            if isinstance(sibling, NavigableString):
                continue

            # The best candidate itself is always kept.
            append = sibling is best_elem

            sibling_key = HashableElement(sibling)
            if sibling_key in candidates and candidates[sibling_key]["content_score"] >= sibling_score_threshold:
                append = True

            if sibling.name == "p":
                link_density = self.get_link_density(sibling)
                node_content = sibling.string or ""
                node_length = len(node_content)

                if node_length > 80 and link_density < 0.25:
                    append = True
                elif node_length < 80 and link_density == 0 and re.search(r"\.( |$)", node_content):
                    append = True

            if append:
                output.append(sibling)

        if not output:
            # Fallback: append the element, not the candidate dict that wraps it.
            output.append(best_elem)
        return output

Пример #5

Показать файл

Файл: decruft.py Проект: olethanh/decruft

 def _html(self, force=False):
     """Return the parsed document, cleaning and parsing ``self.input`` when needed.

     A cached ``self.html`` is returned as-is unless ``force`` is true; otherwise
     the input is passed through a ``Cleaner`` and the cleaned markup is handed
     to ``parse`` together with the configured URL and notify callback.
     """
     # Fast path: reuse the cached document.
     if not force and self.html is not None:
         return self.html

     cleaner = Cleaner(
         scripts=True,
         javascript=True,
         comments=True,
         style=True,
         links=True,
         meta=False,
         add_nofollow=False,
         page_structure=False,
         processing_instructions=True,
         embedded=False,
         frames=False,
         forms=False,
         annoying_tags=False,
         remove_tags=None,
         remove_unknown_tags=False,
         safe_attrs_only=False,
     )
     cleaned_markup = cleaner.clean_html(self.input)
     self.html = parse(cleaned_markup, self.options['url'], notify=self.notify)
     return self.html

Пример #6

Показать файл

Файл: decruft.py Проект: vuquochuy/decruft

 def _html(self, force=False):
     """Lazily build and cache the parsed document for ``self.input``.

     When ``force`` is true or nothing is cached yet, the input is run through
     a ``Cleaner`` and the result is parsed with the configured URL and notify
     callback; the parsed document is stored in ``self.html`` and returned.
     """
     needs_parse = force or self.html is None
     if needs_parse:
         cleaner_settings = dict(
             scripts=True, javascript=True, comments=True,
             style=True, links=True, meta=False, add_nofollow=False,
             page_structure=False, processing_instructions=True, embedded=False,
             frames=False, forms=False, annoying_tags=False, remove_tags=None,
             remove_unknown_tags=False, safe_attrs_only=False,
         )
         sanitizer = Cleaner(**cleaner_settings)
         sanitized = sanitizer.clean_html(self.input)
         self.html = parse(sanitized, self.options['url'], notify=self.notify)
     return self.html

Пример #7

Показать файл

Файл: readability.py Проект: Nsl42/rssint

 def _html(self, force=False):
     """Return the parsed document, parsing ``self.input`` on first use.

     The result of ``parse`` is cached in ``self.html``; pass ``force=True`` to
     re-parse even when a cached document exists.
     """
     # Fast path: reuse the cached document.
     if not force and self.html is not None:
         return self.html

     base_url = self.options["url"]
     self.html = parse(self.input, base_url, notify=self.notify)
     return self.html