示例#1
0
    def _readable(self):
        """Build the readable article document from the best candidate.

        When there are no candidates, or when preparing the winning node
        leaves it as ``None``, the result of ``_handle_no_candidates`` is
        returned instead.
        """
        if not self.candidates:
            LOG.warning('No candidates found: using document.')
            LOG.debug('Begin final prep of article')
            return self._handle_no_candidates()

        LOG.debug('Candidates found:')
        printer = PrettyPrinter(indent=2)

        # Drop the nodes flagged in _should_drop; nodes without a parent
        # are left alone.
        for node in self._should_drop:
            if node.getparent() is not None:
                node.drop_tree()

        # Rank candidates from highest to lowest content_score.
        ranked = list(self.candidates.values())
        ranked.sort(key=attrgetter('content_score'), reverse=True)
        LOG.debug(printer.pformat(ranked))

        # The top scorer wins; its siblings are checked for extra content.
        top = ranked[0]
        LOG.debug('Selected winning node: ' + str(top))
        merged = check_siblings(top, self.candidates)
        LOG.debug('Begin final prep of article')
        merged.node = prep_article(merged.node)

        if merged.node is None:
            LOG.warning('Had candidates but failed to find a cleaned winning doc.')
            return self._handle_no_candidates()

        return build_base_document(merged.node, self.fragment)
示例#2
0
    def _readable(self):
        """Return the readable article document built from ``self.orig.html``.

        The original document is run through ``html_cleaner`` (in place),
        ``drop_tag`` for ``noscript``/``iframe`` and
        ``transform_misused_divs_into_paragraphs`` before ``find_candidates``
        scores it. The highest scoring candidate, after ``check_siblings``
        has looked at its siblings, is prepared and wrapped with
        ``build_base_document``. When no candidates are found the whole
        processed document is used instead.
        """
        doc = self.orig.html
        # cleaning doesn't return, just wipes in place
        html_cleaner(doc)
        doc = drop_tag(doc, "noscript", "iframe")
        doc = transform_misused_divs_into_paragraphs(doc)
        candidates, should_drop = find_candidates(doc)

        if not candidates:
            LOG.warning("No candidates found: using document.")
            LOG.debug("Begin final prep of article")
            # Since no good candidate was found, help the cleanup by removing
            # the nodes flagged in should_drop. Nodes without a parent are
            # skipped: presumably these are lxml elements, whose drop_tree()
            # requires a parent and would otherwise raise.
            for node in should_drop:
                if node.getparent() is not None:
                    node.drop_tree()
            return build_base_document(prep_article(doc), self.fragment)

        LOG.debug("Candidates found:")
        pp = PrettyPrinter(indent=2)

        # right now we return the highest scoring candidate content
        by_score = sorted(
            candidates.values(), key=attrgetter("content_score"), reverse=True
        )
        LOG.debug(pp.pformat(by_score))

        # since we have several candidates, check the winner's siblings
        # for extra content
        winner = by_score[0]
        LOG.debug("Selected winning node: " + str(winner))
        updated_winner = check_siblings(winner, candidates)
        LOG.debug("Begin final prep of article")
        updated_winner.node = prep_article(updated_winner.node)
        return build_base_document(updated_winner.node, self.fragment)
示例#3
0
    def _handle_no_candidates(self):
        """Produce a document when no usable candidate was found.

        Uses the whole of ``self.doc`` when it is present and non-empty;
        otherwise returns the result of ``build_error_document``.
        """
        if self.doc is None or not len(self.doc):
            LOG.warning('No document to use.')
            return build_error_document(self.fragment)

        # Remove the nodes flagged in _should_drop, leaving alone any node
        # that has no parent.
        for node in self._should_drop:
            if node.getparent() is not None:
                node.drop_tree()

        prepared = prep_article(self.doc)
        return build_base_document(prepared, self.fragment)
示例#4
0
    def _handle_no_candidates(self):
        """Fallback used when no good candidate is available.

        Returns a base document built from the prepared ``self.doc`` when it
        is present and non-empty, or the result of ``build_error_document``
        when there is nothing to use.
        """
        if self.doc is None or not len(self.doc):
            LOG.warning('No document to use.')
            result = build_error_document(self.fragment)
        else:
            # Clean up by dropping the flagged nodes that still have a parent.
            for flagged in self._should_drop:
                if flagged.getparent() is None:
                    continue
                flagged.drop_tree()
            result = build_base_document(prep_article(self.doc), self.fragment)

        return result
示例#5
0
    def _readable(self):
        """Return the readable document for the parsed article.

        The highest scoring candidate is selected, extended via
        ``check_siblings`` and prepared with ``prep_article``. If there are
        no candidates, or the prepared node ends up ``None``, the document
        from ``_handle_no_candidates`` is returned instead.
        """
        if not self.candidates:
            LOG.warning('No candidates found: using document.')
            LOG.debug('Begin final prep of article')
            result = self._handle_no_candidates()
        else:
            LOG.debug('Candidates found:')
            formatter = PrettyPrinter(indent=2)

            # Clean up by dropping the flagged nodes that still have a parent.
            for flagged in self._should_drop:
                if flagged.getparent() is None:
                    continue
                flagged.drop_tree()

            # Highest content_score first.
            score_of = attrgetter('content_score')
            ordered = sorted(self.candidates.values(),
                             key=score_of,
                             reverse=True)
            LOG.debug(formatter.pformat(ordered))

            # With several candidates available, the winner's siblings are
            # checked for extra content.
            best = ordered[0]
            LOG.debug('Selected winning node: ' + str(best))
            final = check_siblings(best, self.candidates)
            LOG.debug('Begin final prep of article')
            final.node = prep_article(final.node)

            if final.node is None:
                LOG.warning(
                    'Had candidates but failed to find a cleaned winning doc.')
                result = self._handle_no_candidates()
            else:
                result = build_base_document(final.node, self.fragment)

        return result