async def tts(self, text, target=None):
    """
    Request text-to-speech data for ``text`` and return the response bytes.

    The text is divided with ``split_text`` and one GET request per piece
    is sent to ``self._TRANSLATE_TTS_URL``. The response bodies are joined
    in request order.

    :param text: the text to send.
    :param target: value for the ``tl`` query parameter; when falsy,
                   ``self._target`` is used instead.
    :return: the concatenated response bodies as ``bytes``.
    :raises ValueError: if any request is answered with HTTP 404.
    """
    # Refresh the token seed before computing any per-piece token.
    if self._need_refresh_tkk():
        async with self._tkk_lock:
            self._tkk = await self._fetch_tkk()

    pieces = list(split_text(text))
    chunks = []
    for index, piece in enumerate(pieces):
        query = [
            ('ie', 'UTF-8'),
            ('q', piece),
            ('tl', target or self._target),
            ('total', len(pieces)),
            ('idx', index),
            ('textlen', len(helpers.add_surrogate(piece))),
            ('tk', self._calc_token(piece)),
            ('client', 'webapp'),
            ('prev', 'input'),
        ]
        async with self._session.get(
                self._TRANSLATE_TTS_URL, params=query) as resp:
            if resp.status == 404:
                raise ValueError('unknown target language')
            chunks.append(await resp.read())

    return b''.join(chunks)
def unparse(text, entities, delimiters=None, url_fmt=None):
    """
    Performs the reverse operation to .parse(), effectively returning
    markdown-like syntax given a normal text and its MessageEntity's.

    :param text: the text to be reconverted into markdown.
    :param entities: the MessageEntity's applied to the text. A single
                     ``TLObject`` is accepted and treated as a 1-tuple.
    :param delimiters: mapping of ``{delimiter: entity type}``. ``None``
                       selects ``DEFAULT_DELIMITERS``; any other falsy
                       value (e.g. ``{}``) returns ``text`` untouched.
    :param url_fmt: deprecated and ignored; passing it only emits a warning.
    :return: a markdown-like text representing the combination of both inputs.
    """
    if not text or not entities:
        return text

    if not delimiters:
        if delimiters is not None:
            return text
        delimiters = DEFAULT_DELIMITERS

    if url_fmt is not None:
        warnings.warn(
            'url_fmt is deprecated')  # since it complicates everything *a lot*

    if isinstance(entities, TLObject):
        entities = (entities, )

    # Entity offsets are expressed against the surrogate-expanded text.
    text = add_surrogate(text)
    # Invert to {entity type: delimiter} for lookup by entity class.
    delimiters = {v: k for k, v in delimiters.items()}
    insert_at = []
    for entity in entities:
        s = entity.offset
        e = entity.offset + entity.length
        delimiter = delimiters.get(type(entity), None)
        if delimiter:
            insert_at.append((s, delimiter))
            insert_at.append((e, delimiter))
        else:
            url = None
            if isinstance(entity, MessageEntityTextUrl):
                url = entity.url
            elif isinstance(entity, MessageEntityMentionName):
                url = 'tg://user?id={}'.format(entity.user_id)
            if url:
                insert_at.append((s, '['))
                insert_at.append((e, ']({})'.format(url)))

    # Insert from the end of the text backwards so that earlier positions
    # are not shifted by text that has already been inserted.
    insert_at.sort(key=lambda t: t[0])
    while insert_at:
        at, what = insert_at.pop()

        # If we are in the middle of a surrogate pair nudge the position
        # by +1. Otherwise we would end up with malformed text and fail to
        # encode. For example of bad input: "Hi \ud83d\ude1c"
        # https://en.wikipedia.org/wiki/UTF-16#U+010000_to_U+10FFFF
        #
        # Only a low surrogate that directly follows a high surrogate is
        # "the middle" of a pair. A position that lands on the high
        # surrogate is the *start* of a character and must be left alone,
        # or an entity beginning with an emoji would have its opening
        # delimiter pushed past that emoji.
        if (0 < at < len(text)
                and '\ud800' <= text[at - 1] <= '\udbff'
                and '\udc00' <= text[at] <= '\udfff'):
            at += 1

        text = text[:at] + what + text[at:]

    return del_surrogate(text)
async def telegram_to_matrix(
    evt: Message | SponsoredMessage,
    source: au.AbstractUser,
    main_intent: IntentAPI | None = None,
    prefix_text: str | None = None,
    prefix_html: str | None = None,
    override_text: str | None = None,
    override_entities: list[TypeMessageEntity] | None = None,
    no_reply_fallback: bool = False,
    require_html: bool = False,
) -> TextMessageEventContent:
    """
    Build the Matrix text content for a Telegram message.

    :param evt: the Telegram message to convert.
    :param source: passed through to the forward/reply header helpers.
    :param main_intent: passed through to the reply header helper.
    :param prefix_text: text prepended to the plain-text body.
    :param prefix_html: HTML prepended to the formatted body; forces the
                        content to have an HTML body.
    :param override_text: used instead of ``evt.message`` when truthy.
    :param override_entities: used instead of ``evt.entities`` when truthy.
    :param no_reply_fallback: when true, no reply header is added.
    :param require_html: when true, the content always gets an HTML body.
    :return: the populated ``TextMessageEventContent``.
    """
    # The plain body is kept as a normal string. Only the copy handed to
    # the entity converter is surrogate-expanded (entity offsets are
    # applied to that form), and its output is converted back right away.
    # Storing the surrogate-expanded text in ``body`` would leak surrogate
    # code units into the event, since nothing below converts it back.
    content = TextMessageEventContent(
        msgtype=MessageType.TEXT,
        body=override_text or evt.message,
    )
    entities = override_entities or evt.entities
    if entities:
        content.format = Format.HTML
        html = await _telegram_entities_to_matrix_catch(
            add_surrogate(content.body), entities)
        content.formatted_body = del_surrogate(html)

    if require_html:
        content.ensure_has_html()

    if prefix_html:
        content.ensure_has_html()
        content.formatted_body = prefix_html + content.formatted_body
    if prefix_text:
        content.body = prefix_text + content.body

    # getattr: not every accepted event type has these attributes.
    if getattr(evt, "fwd_from", None):
        await _add_forward_header(source, content, evt.fwd_from)

    if getattr(evt, "reply_to", None) and not no_reply_fallback:
        await _add_reply_header(source, content, evt, main_intent)

    if isinstance(evt, Message) and evt.post and evt.post_author:
        content.ensure_has_html()
        content.body += f"\n- {evt.post_author}"
        content.formatted_body += f"<br/><i>- <u>{evt.post_author}</u></i>"

    return content
def remove_code_and_mentions(message):
    """
    Return the text of ``message`` with code and mention entities cut out.

    The text is handled in its surrogate-expanded form so that the entity
    offsets and lengths index it directly.

    :param message: object exposing ``message`` (the text) and
                    ``get_entities_text()``.
    :return: the remaining text, converted back with ``del_surrogate``.
    """
    chars = list(add_surrogate(message.message))
    spans = [
        slice(entity.offset, entity.offset + entity.length)
        for entity, _inner in message.get_entities_text()
        if isinstance(entity, (types.MessageEntityCode,
                               types.MessageEntityMention,
                               types.MessageEntityMentionName))
    ]
    # Delete last-to-first so the spans still to be removed keep their
    # original positions.
    for span in spans[::-1]:
        del chars[span]
    return del_surrogate(''.join(chars))
async def _matrix_html_to_telegram(
    client: TelegramClient, html: str
) -> tuple[str, list[TypeMessageEntity]]:
    """
    Convert Matrix HTML into Telegram text plus message entities.

    :param client: handed to ``MatrixParser``.
    :param html: the Matrix formatted body.
    :return: ``(text, entities)`` as produced by ``_cut_long_message``.
    :raises FormatError: if any step fails; the original exception is
                         chained as the cause.
    """
    try:
        # ``html`` is rebound step by step on purpose: the error message
        # below reports whatever state it had reached when the failure
        # happened.
        html = command_regex.sub(r"<command>\1</command>", html).replace("\t", "    ")
        html = not_command_regex.sub(r"\1", html)

        result = await MatrixParser(client).parse(add_surrogate(html))
        stripped = del_surrogate(result.text.strip())
        text, entities = _cut_long_message(stripped, result.telegram_entities)
        return text, entities
    except Exception as err:
        raise FormatError(f"Failed to convert Matrix format: {html}") from err
def matrix_to_telegram(html: str) -> ParsedMessage:
    """
    Convert Matrix HTML into Telegram text plus message entities.

    :param html: the Matrix formatted body.
    :return: ``(text, entities)`` as produced by ``cut_long_message``.
    :raises FormatError: if any step fails; the original exception is
                         chained as the cause.
    """
    try:
        # ``html`` is rebound step by step on purpose: the error message
        # below reports whatever state it had reached when the failure
        # happened.
        html = command_regex.sub(r"<command>\1</command>", html).replace("\t", "    ")
        html = not_command_regex.sub(r"\1", html)
        if should_bridge_plaintext_highlights:
            html = plain_mention_regex.sub(plain_mention_to_html, html)

        parsed_text, parsed_entities = parse_html(add_surrogate(html))
        stripped = del_surrogate(parsed_text.strip())
        text, entities = cut_long_message(stripped, parsed_entities)
        return text, entities
    except Exception as err:
        raise FormatError(f"Failed to convert Matrix format: {html}") from err
async def _hacky_find_mention(
    evt: CommandEvent,
) -> TypeInputUser | TypeInputPeer | None:
    """
    Resolve the first mention found in the command's message content.

    :param evt: the command event; its content body and formatted body
                are converted with ``fmt.matrix_to_telegram``.
    :return: the input entity for the first username mention, or the
             ``user_id`` of the first name mention, whichever entity comes
             first; ``None`` when there are no arguments or no mention.
    """
    if len(evt.args) == 0:
        return None

    text, entities = await fmt.matrix_to_telegram(
        evt.sender.client,
        text=evt.content.body,
        html=evt.content.formatted_body,
    )
    for entity in entities:
        if isinstance(entity, MessageEntityMention):
            # Offsets index the surrogate-expanded text. The first
            # character of the mention is skipped (presumably the "@").
            start = entity.offset + 1
            stop = entity.offset + entity.length
            username = add_surrogate(text)[start:stop]
            return await evt.sender.client.get_input_entity(username)
        if isinstance(entity, InputMessageEntityMentionName):
            return entity.user_id
    return None
async def telegram_to_matrix(
        evt: Message,
        source: "AbstractUser",
        main_intent: Optional[IntentAPI] = None,
        prefix_text: Optional[str] = None,
        prefix_html: Optional[str] = None,
        override_text: str = None,
        override_entities: List[TypeMessageEntity] = None,
        no_reply_fallback: bool = False) -> TextMessageEventContent:
    """
    Build the Matrix text content for a Telegram message.

    The body is kept surrogate-expanded while it is being assembled and is
    converted back with ``del_surrogate`` just before returning.

    :param evt: the Telegram message to convert.
    :param source: passed through to the forward/reply header helpers.
    :param main_intent: passed through to the reply header helper.
    :param prefix_text: text prepended to the plain-text body.
    :param prefix_html: HTML prepended to the formatted body; forces the
                        content to have an HTML body.
    :param override_text: used instead of ``evt.message`` when truthy.
    :param override_entities: used instead of ``evt.entities`` when truthy.
    :param no_reply_fallback: when true, no reply header is added.
    :return: the populated ``TextMessageEventContent``.
    """
    content = TextMessageEventContent(
        msgtype=MessageType.TEXT,
        body=add_surrogate(override_text or evt.message),
    )
    entities = override_entities or evt.entities
    if entities:
        content.format = Format.HTML
        content.formatted_body = _telegram_entities_to_matrix_catch(
            content.body, entities)

    if prefix_html:
        if not content.formatted_body:
            content.format = Format.HTML
            content.formatted_body = escape(content.body)
        content.formatted_body = prefix_html + content.formatted_body
    if prefix_text:
        content.body = prefix_text + content.body

    if evt.fwd_from:
        await _add_forward_header(source, content, evt.fwd_from)

    if evt.reply_to_msg_id and not no_reply_fallback:
        await _add_reply_header(source, content, evt, main_intent)

    if isinstance(evt, Message) and evt.post and evt.post_author:
        if not content.formatted_body:
            # A formatted body is only meaningful together with its
            # format, so set both, exactly like the prefix_html branch.
            content.format = Format.HTML
            content.formatted_body = escape(content.body)
        content.body += f"\n- {evt.post_author}"
        # The author signature is plain text; escape it before embedding
        # it in HTML so characters such as "<" or "&" are not interpreted
        # as markup.
        content.formatted_body += (
            f"<br/><i>- <u>{escape(evt.post_author)}</u></i>")

    content.body = del_surrogate(content.body)

    if content.formatted_body:
        content.formatted_body = del_surrogate(
            content.formatted_body.replace("\n", "<br/>"))

    return content
def parse(message, delimiters=None, url_re=None):
    """
    Parses the given markdown message and returns its stripped representation
    plus a list of the MessageEntity's that were found.

    :param message: the message with markdown-like syntax to be parsed.
    :param delimiters: the delimiters to be used, {delimiter: type}.
                       ``None`` selects ``DEFAULT_DELIMITERS``; any other
                       falsy value disables parsing entirely.
    :param url_re: the URL regex to be used (compiled pattern or ``str``).
                   Must have two groups: the link text and the URL.
    :return: a tuple consisting of (clean message, [message entities]).
    """
    if not message:
        return message, []

    if url_re is None:
        url_re = DEFAULT_URL_RE
    elif isinstance(url_re, str):
        url_re = re.compile(url_re)

    if not delimiters:
        if delimiters is not None:
            return message, []
        delimiters = DEFAULT_DELIMITERS

    # Build a regex to efficiently test all delimiters at once.
    # Note that the largest delimiter should go first, we don't
    # want ``` to be interpreted as a single back-tick in a code block.
    delim_re = re.compile('|'.join(
        '({})'.format(re.escape(k))
        for k in sorted(delimiters, key=len, reverse=True)))

    # Cannot use a for loop because we need to skip some indices
    i = 0
    result = []

    # Work on the surrogate-expanded text so that string indices can be
    # used directly as entity offsets and lengths.
    message = add_surrogate(message)
    while i < len(message):
        m = delim_re.match(message, pos=i)

        # Did we find some delimiter here at `i`?
        if m:
            delim = next(filter(None, m.groups()))

            # +1 to avoid matching right after (e.g. "****")
            end = message.find(delim, i + len(delim) + 1)

            # Did we find the earliest closing tag?
            if end != -1:
                # Remove the delimiter from the string
                message = ''.join((message[:i],
                                   message[i + len(delim):end],
                                   message[end + len(delim):]))

                # Check other affected entities
                for ent in result:
                    # If the end is after our start, it is affected
                    if ent.offset + ent.length > i:
                        # If the old start is also before ours, it is
                        # fully enclosed
                        if ent.offset <= i:
                            ent.length -= len(delim) * 2
                        else:
                            ent.length -= len(delim)

                # Append the found entity
                ent = delimiters[delim]
                if ent == MessageEntityPre:
                    result.append(ent(i, end - i - len(delim), ''))  # has 'lang'
                else:
                    result.append(ent(i, end - i - len(delim)))

                # No nested entities inside code blocks
                if ent in (MessageEntityCode, MessageEntityPre):
                    i = end - len(delim)

                # Re-examine the same index: another delimiter (or an
                # inline URL) may start right where this one was removed.
                continue

        elif url_re:
            m = url_re.match(message, pos=i)
            if m:
                # Replace the whole match with only the inline URL text.
                message = ''.join(
                    (message[:m.start()], m.group(1), message[m.end():]))

                # Number of characters removed by the replacement above:
                # the full match minus the link text that was kept.
                # (Subtracting the length of the full match itself would
                # always give 0 and leave enclosing entities too long.)
                delim_size = m.end() - m.start() - len(m.group(1))
                for ent in result:
                    # If the end is after our start, it is affected
                    if ent.offset + ent.length > m.start():
                        ent.length -= delim_size

                result.append(
                    MessageEntityTextUrl(offset=m.start(),
                                         length=len(m.group(1)),
                                         url=del_surrogate(m.group(2))))
                i += len(m.group(1))
                continue

        i += 1

    message = strip_text(message, result)
    return del_surrogate(message), result
def code_converter(content):
    """
    Strip ``content`` and mark all of it as a single code entity.

    :param content: the text to convert.
    :return: ``(stripped_text, [entity])`` where the entity starts at
             offset 0 and spans the surrogate-expanded length of the
             stripped text.
    """
    stripped = content.strip()
    whole_text = types.MessageEntityCode(
        offset=0,
        length=len(add_surrogate(stripped)),
    )
    return stripped, [whole_text]
def surrogate_len(s: str) -> int:
    """Return the length of ``s`` after it is passed through ``add_surrogate``."""
    expanded = add_surrogate(s)
    return len(expanded)
def _process_text(self, params):
    """
    Fill ``params`` with the message text derived from ``self.source``.

    Depending on the attachments in ``self.attachments`` this sets
    ``params['message']`` and, when a ``Geo`` attachment is present,
    ``params['lat']`` / ``params['long']``.

    Side effects visible in this method: when ``self.fwd == FWD_APPEND``
    and a ``Fwd`` attachment exists, its URL is appended to
    ``self.source.text`` (with a matching URL entity) and the attachment
    is removed from ``self.attachments``; a ``Geo`` attachment is removed
    as well.

    :param params: dict that receives the outgoing request parameters.
    :return: ``False`` when the source has no text or a ``Page``
             attachment is present, ``True`` otherwise.

    NOTE(review): the nesting of a few statements (marked below) was
    reconstructed from a flattened copy of this file - confirm against
    the original layout.
    """
    if not self.source.text:
        return False

    append_from = self.fwd == FWD_APPEND
    fwd = None
    for att in reversed(self.attachments):
        if append_from and isinstance(att, Fwd):
            fwd = _type_in_list(reversed(self.attachments), Fwd)
            # Fwd.url is already resolved here
            self.source.entities.append(
                types.MessageEntityUrl(
                    len(self.source.raw_text) + 2, len(fwd.url) - 2))
            self.source.text += '\n\n' + fwd.url
            # Only the first matching Fwd attachment is appended.
            append_from = False
            continue

        if not isinstance(att, Url):
            continue

        # The whole text is exactly this URL attachment.
        if self.source.text == str(att.url):
            if att.title:
                params['message'] = att.title
            # NOTE(review): assumed to belong to the outer `if`, i.e. we
            # return even when the attachment has no title - confirm.
            return True

    # NOTE(review): assumed to run after the loop, not inside it - confirm.
    if fwd:
        self.attachments.remove(fwd)

    # Collect the entities that carry (or can be turned into) a link.
    text_urls = []
    for e, inner_text in self.source.get_entities_text():
        # NOTE no MessageEntityMentionName usage examples/documentation available
        # so assume it is same as MessageEntityMention
        if isinstance(
                e,
                (types.MessageEntityMention, types.MessageEntityMentionName)):
            # inner_text[1:] drops the first character of the mention text
            # (presumably the leading "@").
            text_urls.append(
                types.MessageEntityTextUrl(
                    e.offset, e.length, 'https://t.me/' + inner_text[1:]))
            continue

        if isinstance(e, types.MessageEntityTextUrl):
            text_urls.append(e)

    geo = _type_in_list(self.attachments, Geo)
    if geo:
        self.attachments.remove(geo)
        params['lat'] = geo.lat
        params['long'] = geo.long

    # if this is a rich text
    rich_page = _type_in_list(self.attachments, Page)
    if rich_page:
        params['message'] = rich_page.title
        return False

    if text_urls:
        # add_surrogate/del_surrogate are used by Telethon internally in
        # get_entities_text -> get_inner_text to get correct offsets in unicode
        raw_text = add_surrogate(self.source.raw_text)
        msg = []
        prev = 0
        for tu in text_urls:
            # Everything from the end of the previous link up to and
            # including this link's own text.
            title = del_surrogate(raw_text[prev:(tu.offset + tu.length)])
            # link titles to telegraph photos look like \u200b\u200b
            if _ZERO_CHARS.match(title):
                # `prev` is intentionally left untouched by this branch as
                # written, so the skipped span is included in the next
                # slice. NOTE(review): confirm this is the intent.
                continue

            msg.append(title)
            msg.append(' (' + tu.url + ') ')
            prev = tu.offset + tu.length

        # Remaining text after the last link.
        msg.append(del_surrogate(raw_text[prev:]))
        del raw_text
        params['message'] = ''.join(msg)
    else:
        params['message'] = self.source.raw_text

    return True
def _process_rich_text(self):
    """
    Turn a long, formatted source message into a ``Page`` attachment.

    A message qualifies when it has entities, at least one of them is
    bold/italic/pre/code, and its raw text is at least
    ``xpost.rich_text_min_length`` characters long (default 256).

    The page title is the start of the raw text, cut at the earliest
    newline, ``'. '`` or ``', '`` found inside the configured title
    window (or at the window end minus 3 if that comes first), followed
    by ``'...'``. The page body is the raw text with every entity
    rendered through a per-type template.

    :return: ``True`` if a ``Page`` was appended to ``self.attachments``,
             ``False`` if the message does not qualify.
    """
    min_length = config.getint('xpost', 'rich_text_min_length', fallback=256)

    # Guard clauses, checked in the same order as a chained `and`.
    if not self.source.entities:
        return False
    rich_types = (
        types.MessageEntityBold, types.MessageEntityItalic,
        types.MessageEntityPre, types.MessageEntityCode
    )
    if not _type_in_list(self.source.entities, rich_types):
        return False
    if len(self.source.raw_text) < min_length:
        return False

    # --- title ---------------------------------------------------------
    title_lo = config.getint('xpost', 'min_title_length', fallback=8)
    title_hi = min_length // 4
    cut_candidates = [
        self.source.raw_text.find(separator, title_lo, title_hi)
        for separator in ('\n', '. ', ', ')
    ]
    cut_candidates.append(title_hi - 3)
    # -1 means "not found" and must not win the min().
    cut = min(c for c in cut_candidates if c != -1)
    title = self.source.raw_text[:cut].strip() + '...'

    # --- body ----------------------------------------------------------
    # {0} is the entity's inner text, {1} the extra value (username / url).
    templates = {
        types.MessageEntityBold: '<b>{0}</b>',
        types.MessageEntityItalic: '<i>{0}</i>',
        types.MessageEntityPre: '<pre>{0}</pre>',
        types.MessageEntityCode: '<code>{0}</code>',
        types.MessageEntityMention: '[https://t.me/{1}|{0}]',
        types.MessageEntityMentionName: '[https://t.me/{1}|{0}]',
        types.MessageEntityUrl: '[{0}]',
        types.MessageEntityTextUrl: '[{1}|{0}]'
    }

    # add_surrogate/del_surrogate are used by Telethon internally in
    # get_entities_text -> get_inner_text to get correct offsets in unicode
    surrogated = add_surrogate(self.source.raw_text)
    pieces = []
    cursor = 0
    for entity, inner in self.source.get_entities_text():
        # Plain text between the previous entity and this one.
        pieces.append(del_surrogate(surrogated[cursor:entity.offset]))

        # NOTE no MessageEntityMentionName usage examples/documentation available
        # so assume it is same as MessageEntityMention
        if isinstance(
                entity,
                (types.MessageEntityMention, types.MessageEntityMentionName)):
            extra = inner[1:]
        elif isinstance(entity, types.MessageEntityTextUrl):
            extra = entity.url
        else:
            extra = None

        template = templates.get(type(entity), '{0}')
        pieces.append(del_surrogate(template.format(inner, extra)))
        cursor = entity.offset + entity.length

    # Whatever follows the last entity.
    pieces.append(del_surrogate(surrogated[cursor:]))

    page = Page(self.session, self.default_params, self.group_id, title,
                ''.join(pieces), self.attachments)
    self.attachments.append(page)
    return True
def _calc_token(self, text):
    """
    Compute the ``tk`` request token for ``text`` from ``self._tkk``.

    Original code by ultrafunkamsterdam/googletranslate:
    https://github.com/ultrafunkamsterdam/googletranslate/blob/bd3f4d0a1386ffa634c8ebbebb3603279f3ece99/googletranslate/__init__.py#L263

    If this ever breaks, the way it was found was in one of the top-100
    longest lines of `translate_m.js` used by translate.google.com, it
    uses a single-line with all these "magic" values and one can look
    around there and use a debugger to figure out how it works. It's
    a very straight-forward port.

    :param text: the text the token is computed for.
    :return: a string of the form ``'<n>.<m>'`` where ``n`` is the hash
             reduced modulo 1000000 and ``m`` is ``n`` XOR-ed with
             ``self._tkk[0]``.
    """
    def xor_rot(a, b):
        # `b` is an operation string consumed three characters at a time:
        #   b[c]     '+' -> add the shifted value (masked to 32 bits),
        #            anything else -> XOR it in
        #   b[c + 1] '+' -> shift right, anything else -> shift left
        #   b[c + 2] shift amount: a decimal digit, or a letter where
        #            'a' is 10, 'b' is 11, ...
        size_b = len(b)
        c = 0
        while c < size_b - 2:
            d = b[c + 2]
            d = ord(d[0]) - 87 if 'a' <= d else int(d)
            d = (a % 0x100000000) >> d if '+' == b[c + 1] else a << d
            a = a + d & 4294967295 if '+' == b[c] else a ^ d
            c += 3
        return a

    # `a` first holds the code units of the text. Code points at or above
    # 0x10000 are split into a high/low surrogate pair by hand.
    a = []
    text = helpers.add_surrogate(text)
    for i in text:
        val = ord(i)
        if val < 0x10000:
            a += [val]
        else:
            a += [
                math.floor((val - 0x10000) / 0x400 + 0xD800),
                math.floor((val - 0x10000) % 0x400 + 0xDC00),
            ]

    # self._tkk is indexed with [0] and [1] below
    # (presumably a pair of integers - confirm against _fetch_tkk).
    d = self._tkk
    b = d[0]
    # `e` collects byte values following the UTF-8 bit layout: values
    # below 128 as-is, below 2048 as two bytes, a high+low surrogate pair
    # combined into one code point and emitted as four bytes, everything
    # else as three bytes.
    e = []
    g = 0
    size = len(text)
    while g < size:
        l = a[g]
        if l < 128:
            e.append(l)
        else:
            if l < 2048:
                e.append(l >> 6 | 192)
            else:
                # 64512 == 0xFC00, 55296 == 0xD800, 56320 == 0xDC00:
                # a high surrogate immediately followed by a low one.
                if ((l & 64512) == 55296 and g + 1 < size and
                        a[g + 1] & 64512 == 56320):
                    g += 1
                    l = 65536 + ((l & 1023) << 10) + (a[g] & 1023)
                    e.append(l >> 18 | 240)
                    e.append(l >> 12 & 63 | 128)
                else:
                    e.append(l >> 12 | 224)
                e.append(l >> 6 & 63 | 128)
            e.append(l & 63 | 128)
        g += 1

    # From here on `a` is reused as the running hash, seeded with d[0].
    a = b
    for i, value in enumerate(e):
        a += value
        a = xor_rot(a, '+-a^+6')
    a = xor_rot(a, '+-3^+b+-f')
    a ^= d[1]
    if a < 0:
        # Map a negative value to its unsigned 32-bit equivalent.
        a = (a & 2147483647) + 2147483648
    a %= 1000000
    return '{}.{}'.format(a, a ^ b)