Пример #1
0
    def get_article(self, candidates, best_candidate):
        """Collect the best candidate and its related siblings into one container.

        Args:
            candidates: mapping keyed by element whose values are dicts
                holding a 'content_score' entry.
            best_candidate: dict with 'elem' (the top-scoring element) and
                'content_score'.

        Returns:
            The container created by ``parse("<div/>")`` with every selected
            element appended, in the order the elements occur among the
            parent's children.
        """
        # Now that we have the top candidate, look through its siblings for
        # content that might also be related: preambles, content split by
        # ads that we removed, etc.
        best_elem = best_candidate['elem']
        sibling_score_threshold = max(10, best_candidate['content_score'] * 0.2)
        output = parse("<div/>")

        # A root element has no parent; treat it as its own only sibling
        # rather than failing on ``None.getchildren()``.
        parent = best_elem.getparent()
        siblings = parent.getchildren() if parent is not None else [best_elem]

        for sibling in siblings:
            # The best candidate itself is always kept.
            append = sibling is best_elem

            # Siblings that were scored and reach the threshold are kept too.
            if sibling in candidates and candidates[sibling]['content_score'] >= sibling_score_threshold:
                append = True

            if sibling.tag == "p":
                link_density = self.get_link_density(sibling)
                node_content = sibling.text or ""
                node_length = len(node_content)

                if node_length > 80 and link_density < 0.25:
                    # Long paragraph that is not dominated by links.
                    append = True
                elif node_length <= 80 and link_density == 0 and re.search(r'\.( |$)', node_content):
                    # Short, link-free paragraph that contains a sentence end.
                    # ``<=`` closes the gap at exactly 80 characters, which
                    # previously matched neither branch.
                    append = True

            if append:
                output.append(sibling)

        # The best candidate was already appended inside the loop. Appending
        # it again here would move it behind its following siblings and
        # scramble the order, so no extra append is done.
        return output
Пример #2
0
    def get_article(self, candidates, best_candidate):
        """Build the article container from the top candidate and its siblings.

        Args:
            candidates: mapping keyed by element; each value is a dict with a
                'content_score' entry.
            best_candidate: dict providing 'elem' (the top-scoring element)
                and 'content_score'.

        Returns:
            The container produced by ``parse("<div/>")`` holding the chosen
            elements in the order they appear under the shared parent.
        """
        # Siblings of the top candidate may carry related content
        # (preambles, text split apart by removed ads, ...).
        top_elem = best_candidate['elem']
        sibling_score_threshold = max(10, best_candidate['content_score'] * 0.2)
        output = parse("<div/>")

        parent = top_elem.getparent()
        if parent is None:
            # The top candidate is the tree root: there are no siblings to
            # inspect, so only the candidate itself is considered.
            siblings = [top_elem]
        else:
            siblings = parent.getchildren()

        for sibling in siblings:
            append = False
            if sibling is top_elem:
                append = True
            if sibling in candidates and candidates[sibling]['content_score'] >= sibling_score_threshold:
                append = True

            if sibling.tag == "p":
                link_density = self.get_link_density(sibling)
                node_content = sibling.text or ""
                node_length = len(node_content)

                if node_length > 80 and link_density < 0.25:
                    append = True
                # ``<= 80`` (instead of ``< 80``) so a paragraph of exactly
                # 80 characters is still evaluated by one of the branches.
                elif node_length <= 80 and link_density == 0 and re.search(r'\.( |$)', node_content):
                    append = True

            if append:
                output.append(sibling)

        # No trailing re-append of the top candidate: it is already in
        # ``output``, and appending it again would push it after the
        # siblings that follow it in the document.
        return output
Пример #3
0
	def _html(self, force=False):
		"""Return the parsed document, parsing ``self.input`` when required.

		Args:
			force: when true, parse again even if ``self.html`` is already set.

		Returns:
			``self.html`` after a successful (or skipped) parse, or ``None``
			when parsing raised an error. On failure ``self.html`` keeps
			whatever value it had before the call.
		"""
		if force or self.html is None:
			try:
				self.html = parse(self.input, self.options['url'], notify=self.notify)
			except Exception:
				# Best-effort parse: report failure as None. Catching
				# Exception (not a bare except) lets KeyboardInterrupt and
				# SystemExit propagate instead of being swallowed.
				return None
		return self.html
Пример #4
0
    def get_article(self, candidates, best_candidate):
        """Collect the best candidate and its related siblings into one container.

        Args:
            candidates: mapping keyed by ``HashableElement``; each value is a
                dict with a "content_score" entry.
            best_candidate: dict with "elem" (the top-scoring element) and
                "content_score".

        Returns:
            The container created by ``parse("<div/>")`` with the selected
            elements appended in the order they occur under the parent.
        """
        # Now that we have the top candidate, look through its siblings for
        # content that might also be related: preambles, content split by
        # ads that we removed, etc.
        best_elem = best_candidate["elem"]
        sibling_score_threshold = max(10, best_candidate["content_score"] * 0.2)
        output = parse("<div/>")

        parent = best_elem.parent
        # Iterate over a snapshot. Appending a node to ``output`` presumably
        # detaches it from its current parent (as BeautifulSoup-style trees
        # do), which would shrink ``parent.contents`` mid-iteration and skip
        # siblings. A parentless element is treated as its own only sibling.
        siblings = list(parent.contents) if parent is not None else [best_elem]

        appended_any = False
        for sibling in siblings:
            if isinstance(sibling, NavigableString):
                # Bare text nodes are not considered.
                continue

            append = sibling is best_elem
            sibling_key = HashableElement(sibling)
            if sibling_key in candidates and candidates[sibling_key]["content_score"] >= sibling_score_threshold:
                append = True

            if sibling.name == "p":
                link_density = self.get_link_density(sibling)
                node_content = sibling.string or ""
                node_length = len(node_content)

                if node_length > 80 and link_density < 0.25:
                    append = True
                # ``<= 80`` so a paragraph of exactly 80 characters is not
                # silently excluded from both branches.
                elif node_length <= 80 and link_density == 0 and re.search(r"\.( |$)", node_content):
                    append = True

            if append:
                output.append(sibling)
                appended_any = True

        if not appended_any:
            # Fall back to the best element itself. The previous code
            # appended the whole candidate dict here, and gated the fallback
            # on the truthiness of ``output`` rather than on whether anything
            # had been appended.
            output.append(best_elem)
        return output
Пример #5
0
 def _html(self, force=False):
     """Return the parsed document, cleaning and parsing the input on demand.

     The input is passed through a ``Cleaner`` and then ``parse`` the first
     time this is called, or whenever ``force`` is true; the result is
     stored on ``self.html`` and returned on later calls.

     Args:
         force: when true, clean and parse again even if ``self.html`` is set.

     Returns:
         The value of ``self.html``.
     """
     if not force and self.html is not None:
         # Cached result exists and no refresh was requested.
         return self.html

     # Flags enabled: scripts, javascript, comments, style, links and
     # processing_instructions; every other option is switched off.
     # NOTE(review): presumably lxml.html.clean.Cleaner -- confirm.
     cleaner_settings = dict(
         scripts=True, javascript=True, comments=True, style=True,
         links=True, meta=False, add_nofollow=False, page_structure=False,
         processing_instructions=True, embedded=False, frames=False,
         forms=False, annoying_tags=False, remove_tags=None,
         remove_unknown_tags=False, safe_attrs_only=False,
     )
     sanitized = Cleaner(**cleaner_settings).clean_html(self.input)
     self.html = parse(sanitized, self.options['url'], notify=self.notify)
     return self.html
Пример #6
0
 def _html(self, force=False):
     """Lazily clean and parse ``self.input``, caching the result.

     Args:
         force: when true, redo the clean-and-parse step even though
             ``self.html`` already holds a value.

     Returns:
         The value of ``self.html`` (freshly parsed or previously cached).
     """
     needs_parse = force or self.html is None
     if needs_parse:
         # Only scripts, javascript, comments, style, links and
         # processing_instructions are enabled on the cleaner.
         sanitizer = Cleaner(
             scripts=True,
             javascript=True,
             comments=True,
             style=True,
             links=True,
             meta=False,
             add_nofollow=False,
             page_structure=False,
             processing_instructions=True,
             embedded=False,
             frames=False,
             forms=False,
             annoying_tags=False,
             remove_tags=None,
             remove_unknown_tags=False,
             safe_attrs_only=False,
         )
         cleaned_markup = sanitizer.clean_html(self.input)
         base_url = self.options['url']
         self.html = parse(cleaned_markup, base_url, notify=self.notify)
     return self.html
Пример #7
0
 def _html(self, force=False):
     """Return the parsed document, parsing ``self.input`` on first use.

     Args:
         force: when true, parse again even if ``self.html`` is already set.

     Returns:
         The value of ``self.html``.
     """
     if not force and self.html is not None:
         # Nothing to do: reuse the previously parsed document.
         return self.html

     source = self.input
     base_url = self.options["url"]
     self.html = parse(source, base_url, notify=self.notify)
     return self.html