def truncate_xhtml(string, size, _strip_xhtml=False, _decode_entities=False):
    """Truncate a XHTML string to roughly a given size (full words).

    The length is measured on the entity-decoded text.  Whether or not
    anything had to be cut, the decoded text is put back into the
    requested output form before it is returned, so short and long
    inputs come back escaped the same way.

    :param string: XHTML
    :type string: unicode
    :param size: Max length
    :type size: int
    :param _strip_xhtml: Flag to strip out all XHTML
    :param _decode_entities: Flag to convert XHTML entities to unicode
        chars.  Only consulted when ``_strip_xhtml`` is set.
    :rtype: unicode
    """
    if not string:
        return u''

    if _strip_xhtml:
        # Insert whitespace after block elements so that their text does
        # not run together once the markup is stripped.
        string = block_spaces.sub(u"\\1 ", string)
        string = strip_xhtml(string)

    # Measure (and cut) the decoded text rather than the escaped form.
    string = decode_entities(string)

    if len(string) > size:
        string = text.truncate(string, length=size, whole_word=True)

    # Undo the decoding above on every path, not only after a cut;
    # otherwise an input that already fits would be handed back with its
    # entities decoded even though the caller did not ask for that.
    if not _strip_xhtml:
        # Markup output always goes back through the cleaner.
        string = clean(string, **cleaner_settings)
    elif not _decode_entities:
        # Plain-text output: re-encode the entities, if we have to.
        string = encode_entities(string)

    return string.strip()
def strip_xhtml(string, _decode_entities=False):
    """Drop the markup from an XHTML string and keep only its text.

    :param string: XHTML
    :param _decode_entities: When true, the collected text is also run
        through ``decode_entities()`` before being returned.
    :rtype: unicode
    """
    if not string:
        return u''

    # Gather every text node of the parsed document and glue them
    # together; the tags themselves are left behind.
    soup = BeautifulSoup(string)
    text_nodes = soup.findAll(text=True)
    plain = ''.join(text_nodes)

    if not _decode_entities:
        return plain
    return decode_entities(plain)
def excerpt_xhtml(string, size, buffer=60):
    """Return an excerpt for the given string.

    Truncate to the given size iff we are removing more than the buffer
    size.  A string whose decoded length is within ``size + buffer`` is
    returned untouched.

    :param string: A XHTML string
    :param size: The desired length
    :type size: int
    :param buffer: How much more than the desired length we can go to
        avoid truncating just a couple words etc.
    :type buffer: int
    :returns: XHTML

    """
    if not string:
        return u''

    # The decoded copy is needed only to measure the visible length.
    if len(decode_entities(string)) <= size + buffer:
        return string

    # Hand over the original markup: truncate_xhtml() decodes entities
    # itself, so passing it already-decoded text would decode twice and
    # turn double-escaped text (e.g. ``&amp;lt;``) into real markup.
    return truncate_xhtml(string, size)
def strip_xhtml(string, _decode_entities=False):
    """Remove all XHTML markup, optionally leaving entities decoded.

    :param string: XHTML
    :param _decode_entities: When true, return the text with entities
        converted to unicode chars; otherwise the text is passed through
        ``clean()`` once more before it is returned.
    :rtype: unicode
    """
    if not string:
        return u''

    # First pass: no tags and no attributes are allowed, so every piece
    # of xhtml is stripped.
    without_tags = clean(string, strip=True, tags=[], attributes=[])

    # Decoding the entities also takes care of stripping hidden xhtml
    # markup.
    plain = decode_entities(without_tags)

    if _decode_entities:
        return plain

    # We weren't supposed to unescape: clean() re-encodes the escapable
    # characters that the decoding step removed.
    return clean(plain)