def test_word_count_with_lots_of_punctuation():
    """Check the word count of a sentence that is dense with punctuation."""
    # quotes, dashes, parentheses, an accented letter, a percent sign, a
    # semicolon and a slash are all present in the sample
    text = (
        'Even if "everyone" knows this should still work with a lot '
        "-- a LOT -- of punctuation (or spécial characters), it's probably "
        "best not to count 100% on it; that's just foolish/risky."
    )
    expected_words = 31

    assert word_count(text) == expected_words
def get_metadata_from_result(result: ScraperResult) -> Dict[str, Any]:
    """Pick the fields we care about out of an Embedly scrape result.

    The returned dict may contain "title", "description", "word_count",
    "published" and "authors"; a key is only present when the matching
    field in ``result.data`` is truthy.

    Raises:
        ValueError: if ``result`` was not produced by the Embedly scraper.
    """
    if result.scraper_type != ScraperType.EMBEDLY:
        raise ValueError(
            "Can't process a result from a different scraper.")

    data = result.data
    metadata: Dict[str, Any] = {}

    # these two fields are copied over unchanged
    for field in ("title", "description"):
        value = data.get(field)
        if value:
            metadata[field] = value

    html_content = data.get("content")
    if html_content:
        metadata["word_count"] = word_count(
            extract_text_from_html(html_content))

    published_ms = data.get("published")
    if published_ms:
        # the field's value is in milliseconds, store it in seconds instead
        metadata["published"] = published_ms // 1000

    author_entries = data.get("authors")
    if author_entries:
        try:
            names = [entry["name"] for entry in author_entries]
        except KeyError:
            # an entry without a "name" means no authors are recorded at all
            pass
        else:
            metadata["authors"] = names

    return metadata
def _generate_text_metadata(topic: Topic) -> Dict[str, Any]:
    """Build the metadata dict (word count and excerpt) for a text topic.

    The text is extracted from ``topic.rendered_html``; the excerpt is that
    text passed through ``truncate_string`` with ``length=200``.
    """
    text = extract_text_from_html(topic.rendered_html)

    # the excerpt is just the extracted text cut down to a short length
    short_text = truncate_string(text, length=200, truncate_at_chars=" ")
    total_words = word_count(text)

    return {"word_count": total_words, "excerpt": short_text}
def _generate_text_metadata(topic: Topic) -> None:
    """Store a text topic's word count and excerpt in its content metadata.

    Nothing is returned: the result is written to ``topic.content_metadata``,
    replacing whatever value it previously held.
    """
    fragment = HTMLParser().parseFragment(topic.rendered_html)

    # join the text of every element in the parsed fragment, then run it
    # through simplify_string (sanitize unicode, strip outer whitespace, etc.)
    text = simplify_string(''.join(fragment.itertext()))

    # the excerpt is the simplified text truncated to a short length
    short_text = truncate_string(text, length=200, truncate_at_chars=' ')

    topic.content_metadata = {
        'word_count': word_count(text),
        'excerpt': short_text,
    }
def test_word_count_with_apostrophes():
    """Check the word count of a sentence containing apostrophes."""
    sentence = "It's not always false that apostrophes aren't counted properly."
    expected_words = 9

    assert word_count(sentence) == expected_words
def test_simple_word_count():
    """Check the word count of a plain, unremarkable sentence."""
    sentence = 'Here is a simple string of words, nothing fancy.'
    expected_words = 9

    assert word_count(sentence) == expected_words