def test_strip_text():
    """strip_text() trims surrounding whitespace when no entities are present."""
    # With an empty entity list there are no offsets to adjust, so the
    # result is simply the stripped string.
    assert helpers.strip_text(" text ", []) == "text"
def moothrix_parse(html):
    """
    Feed *html* through a MethrixParser and return its cleaned-up text
    together with the message entities that were collected.

    :param html: the markup string to parse; falsy input is returned as-is.
    :return: a tuple of (stripped text, [message entities]).
    """
    # Nothing to do for None / empty input.
    if not html:
        return html, []
    p = MethrixParser()
    p.feed(_add_surrogate(html))
    entities = p.entities
    # Strip whitespace while keeping the entity offsets consistent.
    stripped = helpers.strip_text(p.text, entities)
    return _del_surrogate(stripped), entities
def text_and_format_entities_split(plain_text: str, format_entities: Sequence[TypeMessageEntity],
                                   length_limit_head: int = 4096, head_count: int = -1,
                                   length_limit_tail: int = 4096) \
        -> list[tuple[str, list[TypeMessageEntity]]]:
    """
    Split *plain_text* (and its formatting entities) into message-sized chunks.

    Prefers to cut at natural separators (newline, sentence punctuation,
    whitespace) and keeps each chunk's entities re-based so their offsets are
    relative to the chunk, not the whole text.

    :param plain_text: the full text to split.
    :param format_entities: formatting entities covering *plain_text*
        (offsets presumably in UTF-16 code units, as Telegram requires —
        the surrogate_len() helper suggests this; confirm in its definition).
    :param length_limit_head: max length for the first ``head_count`` chunks.
    :param head_count: how many chunks use the head limit; -1 means all of them.
    :param length_limit_tail: max length for the remaining chunks.
    :return: list of (chunk text, chunk entities) tuples, each stripped.
    """
    format_entities = merge_contiguous_entities(copy_entities(format_entities))  # sort and merge
    chunks = []
    pending_text = plain_text
    pending_entities = format_entities[:]
    # Running total of surrogate-length already emitted; used to re-base
    # entity offsets so each chunk's entities start at offset 0.
    surrogate_len_sum = 0
    while pending_text:
        # Head chunks may use a different length limit than tail chunks.
        curr_length_limit = length_limit_head if head_count <= -1 or len(chunks) < head_count else length_limit_tail
        curr_length_limit = min(curr_length_limit, len(pending_text))
        # note: Telegram only allows up to 10000-Byte formatting entities per message
        # here the limit is set to 9500 Bytes to avoid possible problems
        if (len(pending_text) == curr_length_limit
                and not (len(pending_entities) > 100
                         or len(b''.join(x._bytes() for x in pending_entities)) >= 9500)):
            # Everything left fits in one final chunk: re-base its entities and stop.
            if surrogate_len_sum > 0:
                for entity in pending_entities:
                    entity.offset -= surrogate_len_sum
            chunks.append((pending_text, pending_entities))
            break
        # Try progressively smaller limits (steps of 100) if the candidate
        # chunk would exceed Telegram's entity count/size constraints.
        for curr_length_limit in range(curr_length_limit, 0, -100):
            try:
                # Separator candidates in priority order; the final '' always
                # matches (str.rfind('', a, b) == b), guaranteeing progress.
                for sep in ('\n', '。', '. ', ';', '; ', ',', ', ', '?', '? ', '!', '! ',
                            ':', ': ', '\t', ' ', '\xa0', ''):
                    # Only cut in the second half of the window so chunks don't get too small.
                    sep_pos = pending_text.rfind(sep, int(curr_length_limit * 0.5), curr_length_limit)
                    if sep_pos != -1:
                        curr_text = pending_text[:sep_pos + len(sep)]
                        surrogate_end_pos = surrogate_len_sum + surrogate_len(curr_text)
                        _curr_entities = filter_entities_by_range(surrogate_len_sum, surrogate_end_pos,
                                                                  pending_entities)
                        # Candidate chunk still too entity-heavy: retry with a smaller limit.
                        if len(_curr_entities) > 100 or len(b''.join(x._bytes() for x in _curr_entities)) >= 9500:
                            raise OverflowError('Too many entities')
                        # Split entities at the cut point; entities spanning it are divided.
                        curr_entities, pending_entities = split_entities(surrogate_end_pos, pending_entities)
                        if surrogate_len_sum > 0:
                            # Re-base this chunk's entity offsets to the chunk start.
                            for entity in curr_entities:
                                entity.offset -= surrogate_len_sum
                        surrogate_len_sum = surrogate_end_pos
                        chunks.append((curr_text, curr_entities))
                        pending_text = pending_text[sep_pos + len(sep):]
                        break
                break
            except OverflowError:
                # Shrink the window and try again.
                pass
    # Strip whitespace from each chunk, keeping entities consistent.
    stripped_chunks = []
    for text, entity in chunks:
        text = strip_text(text, entity)
        stripped_chunks.append((text, entity))
    return stripped_chunks
def test_strip_text():
    """Whitespace around the text is removed when the entity list is empty."""
    result = helpers.strip_text(" text ", [])
    assert result == "text"
def parse(message, delimiters=None, url_re=None):
    """
    Parses the given markdown message and returns its stripped representation
    plus a list of the MessageEntity's that were found.

    :param message: the message with markdown-like syntax to be parsed.
    :param delimiters: the delimiters to be used, {delimiter: type}.
    :param url_re: the URL bytes regex to be used. Must have two groups.
    :return: a tuple consisting of (clean message, [message entities]).
    """
    if not message:
        return message, []

    if url_re is None:
        url_re = DEFAULT_URL_RE
    elif isinstance(url_re, str):
        url_re = re.compile(url_re)

    if not delimiters:
        # An explicit empty dict means "no parsing at all".
        if delimiters is not None:
            return message, []
        delimiters = DEFAULT_DELIMITERS

    # Build a regex to efficiently test all delimiters at once.
    # Note that the largest delimiter should go first, we don't
    # want ``` to be interpreted as a single back-tick in a code block.
    delim_re = re.compile('|'.join(
        '({})'.format(re.escape(k))
        for k in sorted(delimiters, key=len, reverse=True)))

    # Cannot use a for loop because we need to skip some indices
    i = 0
    result = []

    # Work on byte level with the utf-16le encoding to get the offsets right.
    # The offset will just be half the index we're at.
    message = add_surrogate(message)
    while i < len(message):
        m = delim_re.match(message, pos=i)

        # Did we find some delimiter here at `i`?
        if m:
            # Exactly one group matched; pick it to know which delimiter.
            delim = next(filter(None, m.groups()))

            # +1 to avoid matching right after (e.g. "****")
            end = message.find(delim, i + len(delim) + 1)

            # Did we find the earliest closing tag?
            if end != -1:

                # Remove the delimiter from the string
                message = ''.join((
                    message[:i],
                    message[i + len(delim):end],
                    message[end + len(delim):]
                ))

                # Check other affected entities
                for ent in result:
                    # If the end is after our start, it is affected
                    if ent.offset + ent.length > i:
                        # If the old start is also before ours, it is fully enclosed
                        if ent.offset <= i:
                            ent.length -= len(delim) * 2
                        else:
                            ent.length -= len(delim)

                # Append the found entity
                ent = delimiters[delim]
                if ent == MessageEntityPre:
                    result.append(ent(i, end - i - len(delim), ''))  # has 'lang'
                else:
                    result.append(ent(i, end - i - len(delim)))

                # No nested entities inside code blocks
                if ent in (MessageEntityCode, MessageEntityPre):
                    # Jump past the code block's content so its text is
                    # not re-scanned for further delimiters.
                    i = end - len(delim)

                continue

        elif url_re:
            m = url_re.match(message, pos=i)
            if m:
                # Replace the whole match with only the inline URL text.
                message = ''.join((
                    message[:m.start()],
                    m.group(1),
                    message[m.end():]
                ))

                # Number of characters removed from the message by the rewrite.
                delim_size = m.end() - m.start() - len(m.group())
                for ent in result:
                    # If the end is after our start, it is affected
                    if ent.offset + ent.length > m.start():
                        ent.length -= delim_size

                result.append(MessageEntityTextUrl(
                    offset=m.start(), length=len(m.group(1)),
                    url=del_surrogate(m.group(2))))

                # Skip over the inline text we just kept.
                i += len(m.group(1))
                continue

        i += 1

    message = strip_text(message, result)
    return del_surrogate(message), result