示例#1
0
    def get_metadata_from_result(result: ScraperResult) -> Dict[str, Any]:
        """Pull the metadata fields we keep out of an Embedly scrape result.

        A field is only included in the returned dict when the scraped data
        has a truthy value for it. Raises ValueError if the result was not
        produced by the Embedly scraper.
        """
        if result.scraper_type != ScraperType.EMBEDLY:
            raise ValueError(
                "Can't process a result from a different scraper.")

        data = result.data
        metadata = {}

        # these fields are copied across unchanged
        for field in ("title", "description"):
            if data.get(field):
                metadata[field] = data[field]

        # the word count is based on the text extracted from the HTML content
        html_content = data.get("content")
        if html_content:
            content_text = extract_text_from_html(html_content)
            metadata["word_count"] = word_count(content_text)

        # the field's value is in milliseconds, store it in seconds instead
        published_ms = data.get("published")
        if published_ms:
            metadata["published"] = published_ms // 1000

        author_entries = data.get("authors")
        if author_entries:
            try:
                author_names = [entry["name"] for entry in author_entries]
            except KeyError:
                # an author entry has no "name", so leave the authors out
                pass
            else:
                metadata["authors"] = author_names

        return metadata
    def _generate_text_metadata(topic: Topic) -> Dict[str, Any]:
        """Build the metadata for a text topic: a word count and an excerpt.

        Both values are derived from the text extracted from the topic's
        rendered HTML.
        """
        plain_text = extract_text_from_html(topic.rendered_html)

        # the excerpt is the extracted text truncated to a length of 200,
        # with " " given as the character to truncate at
        short_text = truncate_string(
            plain_text, length=200, truncate_at_chars=" ")
        total_words = word_count(plain_text)

        return {"word_count": total_words, "excerpt": short_text}
def upgrade():
    """Add a non-null "excerpt" text column to comments and backfill it.

    The column is created with an empty-string server default, then an
    excerpt is generated for each existing comment that is not deleted.
    """
    excerpt_column = sa.Column(
        "excerpt", sa.Text(), server_default="", nullable=False)
    op.add_column("comments", excerpt_column)

    # generate excerpts for all existing (non-deleted) comments, using a
    # session bound to the migration's own connection
    session = sa.orm.Session(bind=op.get_bind())
    live_comments = (
        session.query(Comment)
        # "==" (not "is") so that this is built as a query filter expression
        .filter(Comment.is_deleted == False)
        .all()
    )

    for comment in live_comments:
        plain_text = extract_text_from_html(comment.rendered_html)
        comment.excerpt = truncate_string(
            plain_text, length=200, truncate_at_chars=" ")

    session.commit()
示例#4
0
    def markdown(self, new_markdown: str) -> None:
        """Store new markdown and refresh everything derived from it.

        Re-renders the HTML, regenerates the excerpt, and records an edit
        time if the comment's age is past the edit grace period. Does
        nothing at all when the markdown is unchanged.
        """
        if new_markdown == self.markdown:
            # nothing changed, so skip re-rendering and leave the edit time
            return

        self._markdown = new_markdown
        self.rendered_html = convert_markdown_to_safe_html(new_markdown)

        # the excerpt comes from the rendered HTML's text, with blockquote
        # and del tags passed as the tags to skip
        excerpt_source = extract_text_from_html(
            self.rendered_html, skip_tags=["blockquote", "del"])
        self.excerpt = truncate_string(
            excerpt_source, length=200, truncate_at_chars=" ")

        # only changes made after the grace period update the edit time
        past_grace_period = self.age > EDIT_GRACE_PERIOD
        if past_grace_period:
            self.last_edited_time = utc_now()