def get_metadata_from_result(result: ScraperResult) -> Dict[str, Any]:
    """Pull the subset of fields we store out of an Embedly scrape result.

    The returned dict only contains keys whose source value was present and
    truthy in ``result.data``. Possible keys are ``title``, ``description``,
    ``word_count``, ``published`` and ``authors``.

    Raises:
        ValueError: if ``result.scraper_type`` is not ``ScraperType.EMBEDLY``.
    """
    if result.scraper_type != ScraperType.EMBEDLY:
        raise ValueError("Can't process a result from a different scraper.")

    data = result.data
    extracted: Dict[str, Any] = {}

    # these two are copied across unchanged
    for field in ("title", "description"):
        value = data.get(field)
        if value:
            extracted[field] = value

    html = data.get("content")
    if html:
        extracted["word_count"] = word_count(extract_text_from_html(html))

    published_ms = data.get("published")
    if published_ms:
        # the source value is in milliseconds, we keep whole seconds
        extracted["published"] = published_ms // 1000

    author_entries = data.get("authors")
    if author_entries:
        try:
            names = [entry["name"] for entry in author_entries]
        except KeyError:
            # best-effort: if any entry has no "name", leave authors out
            pass
        else:
            extracted["authors"] = names

    return extracted
def _generate_text_metadata(topic: Topic) -> Dict[str, Any]:
    """Build the metadata dict for a text topic.

    The topic's rendered HTML is reduced to text once, and that text is used
    both for the ``excerpt`` (via truncate_string with length=200) and for the
    ``word_count``.
    """
    plain_text = extract_text_from_html(topic.rendered_html)

    # the excerpt is simply a truncated copy of the extracted text
    short_form = truncate_string(plain_text, length=200, truncate_at_chars=" ")
    total_words = word_count(plain_text)

    return {"word_count": total_words, "excerpt": short_form}
def upgrade():
    """Add the ``comments.excerpt`` column and backfill it for existing rows.

    The column is created as non-nullable text with an empty-string server
    default. Every comment whose ``is_deleted`` is false then gets an excerpt
    built from its rendered HTML, and the session is committed once at the end.
    """
    excerpt_column = sa.Column(
        "excerpt", sa.Text(), server_default="", nullable=False
    )
    op.add_column("comments", excerpt_column)

    # backfill excerpts for every existing comment that isn't deleted
    db_session = sa.orm.Session(bind=op.get_bind())
    live_comments = (
        db_session.query(Comment)
        .filter(Comment.is_deleted == False)  # noqa: E712 (SQL expression)
        .all()
    )

    for existing in live_comments:
        plain_text = extract_text_from_html(existing.rendered_html)
        existing.excerpt = truncate_string(
            plain_text, length=200, truncate_at_chars=" "
        )

    db_session.commit()
def markdown(self, new_markdown: str) -> None:
    """Store new markdown for the comment and refresh everything derived from it.

    Nothing happens if the new markdown equals the current value. Otherwise
    the markdown is stored, the HTML is re-rendered, the excerpt is rebuilt
    from that HTML, and - only when the comment's age exceeds
    EDIT_GRACE_PERIOD - the last-edited time is set to the current UTC time.
    """
    if new_markdown == self.markdown:
        return

    self._markdown = new_markdown
    self.rendered_html = convert_markdown_to_safe_html(new_markdown)

    # blockquote and del are passed as tags to skip when extracting the
    # text for the excerpt
    excluded_tags = ["blockquote", "del"]
    plain_text = extract_text_from_html(
        self.rendered_html, skip_tags=excluded_tags
    )
    self.excerpt = truncate_string(plain_text, length=200, truncate_at_chars=" ")

    past_grace_period = self.age > EDIT_GRACE_PERIOD
    if past_grace_period:
        self.last_edited_time = utc_now()