Python add_surrogate 예제들, telethon.helpers.add_surrogate Python 예제들

예제 #1

0

파일 보기

파일: tl.py 프로젝트: zeuslord224/uniborg

    async def tts(self, text, target=None):
        """
        Fetch synthesized speech for ``text`` and return it as raw bytes.

        The text is divided with ``split_text`` and one request is issued
        per piece against ``self._TRANSLATE_TTS_URL``; the response bodies
        are concatenated in order.

        :param text: the text to convert to speech.
        :param target: language code; falls back to ``self._target``.
        :return: the concatenated response bodies.
        :raises ValueError: if any request answers with HTTP 404.
        """
        if self._need_refresh_tkk():
            async with self._tkk_lock:
                self._tkk = await self._fetch_tkk()

        pieces = list(split_text(text))
        total = len(pieces)
        audio_chunks = []
        for index, piece in enumerate(pieces):
            query = [
                ('ie', 'UTF-8'),
                ('q', piece),
                ('tl', target or self._target),
                ('total', total),
                ('idx', index),
                # The length is measured on the surrogate-expanded text.
                ('textlen', len(helpers.add_surrogate(piece))),
                ('tk', self._calc_token(piece)),
                ('client', 'webapp'),
                ('prev', 'input'),
            ]

            request = self._session.get(self._TRANSLATE_TTS_URL, params=query)
            async with request as response:
                if response.status == 404:
                    raise ValueError('unknown target language')
                audio_chunks.append(await response.read())

        return b''.join(audio_chunks)

예제 #2

0

파일 보기

def unparse(text, entities, delimiters=None, url_fmt=None):
    """
    Performs the reverse operation to .parse(), effectively returning
    markdown-like syntax given a normal text and its MessageEntity's.

    :param text: the text to be reconverted into markdown.
    :param entities: the MessageEntity's applied to the text. A single
        ``TLObject`` is accepted and treated as a one-element sequence.
    :param delimiters: mapping of ``{delimiter: entity type}``. ``None``
        selects ``DEFAULT_DELIMITERS``; any other falsy value (such as an
        empty dict) returns ``text`` unchanged.
    :param url_fmt: deprecated and ignored; passing it only emits a warning.
    :return: a markdown-like text representing the combination of both inputs.
    """
    if not text or not entities:
        return text

    if not delimiters:
        if delimiters is not None:
            return text
        delimiters = DEFAULT_DELIMITERS

    if url_fmt is not None:
        warnings.warn(
            'url_fmt is deprecated')  # since it complicates everything *a lot*

    if isinstance(entities, TLObject):
        entities = (entities, )

    # Entity offsets are expressed in UTF-16 code units, so work on the
    # surrogate-expanded text and convert back right before returning.
    text = add_surrogate(text)
    # Invert the mapping: from entity type to delimiter string.
    delimiters = {v: k for k, v in delimiters.items()}
    insert_at = []
    for entity in entities:
        s = entity.offset
        e = entity.offset + entity.length
        delimiter = delimiters.get(type(entity), None)
        if delimiter:
            insert_at.append((s, delimiter))
            insert_at.append((e, delimiter))
        else:
            url = None
            if isinstance(entity, MessageEntityTextUrl):
                url = entity.url
            elif isinstance(entity, MessageEntityMentionName):
                url = 'tg://user?id={}'.format(entity.user_id)
            if url:
                insert_at.append((s, '['))
                insert_at.append((e, ']({})'.format(url)))

    # Insert from the end towards the start so that earlier positions are
    # not shifted by the text inserted after them.
    insert_at.sort(key=lambda t: t[0])
    while insert_at:
        at, what = insert_at.pop()

        # If we are in the middle of a surrogate pair nudge the position by
        # +1. Otherwise we would end up with malformed text and fail to
        # encode. For example of bad input: "Hi \ud83d\ude1c"
        # https://en.wikipedia.org/wiki/UTF-16#U+010000_to_U+10FFFF
        #
        # Only nudge when `at` splits a pair (high surrogate before, low
        # surrogate at). A position right before a high surrogate is a valid
        # boundary and must stay put, or the delimiter would end up after
        # the character it is supposed to precede.
        if (0 < at < len(text)
                and '\ud800' <= text[at - 1] <= '\udbff'
                and '\udc00' <= text[at] <= '\udfff'):
            at += 1

        text = text[:at] + what + text[at:]

    return del_surrogate(text)

예제 #3

0

파일 보기

파일: from_telegram.py 프로젝트: tulir/mautrix-telegram

async def telegram_to_matrix(
    evt: Message | SponsoredMessage,
    source: au.AbstractUser,
    main_intent: IntentAPI | None = None,
    prefix_text: str | None = None,
    prefix_html: str | None = None,
    override_text: str = None,
    override_entities: list[TypeMessageEntity] = None,
    no_reply_fallback: bool = False,
    require_html: bool = False,
) -> TextMessageEventContent:
    """
    Build the Matrix text event content for a Telegram message.

    :param evt: the Telegram message (or sponsored message) to convert.
    :param source: passed through to the forward/reply header helpers.
    :param main_intent: passed through to the reply header helper.
    :param prefix_text: text prepended to the plain body, if given.
    :param prefix_html: HTML prepended to the formatted body, if given.
    :param override_text: used instead of ``evt.message`` when truthy.
    :param override_entities: used instead of ``evt.entities`` when truthy.
    :param no_reply_fallback: skip the reply header even if the message
        has ``reply_to`` set.
    :param require_html: ensure the content has an HTML body even when the
        message has no entities.
    :return: the populated ``TextMessageEventContent``.
    """
    content = TextMessageEventContent(
        msgtype=MessageType.TEXT,
        # Keep the plain body as regular text. The surrogate-expanded form
        # is only needed while applying entity offsets below; storing it
        # here would return a body that still contains UTF-16 surrogate
        # code units, because nothing in this function converts it back.
        body=override_text or evt.message,
    )
    entities = override_entities or evt.entities
    if entities:
        content.format = Format.HTML
        # Entity offsets are in UTF-16 code units, hence the round trip.
        html = await _telegram_entities_to_matrix_catch(
            add_surrogate(content.body), entities)
        content.formatted_body = del_surrogate(html)

    if require_html:
        content.ensure_has_html()

    if prefix_html:
        content.ensure_has_html()
        content.formatted_body = prefix_html + content.formatted_body
    if prefix_text:
        content.body = prefix_text + content.body

    # getattr is used because not every accepted event type is guaranteed
    # to carry these attributes.
    if getattr(evt, "fwd_from", None):
        await _add_forward_header(source, content, evt.fwd_from)

    if getattr(evt, "reply_to", None) and not no_reply_fallback:
        await _add_reply_header(source, content, evt, main_intent)

    # Append the author signature to both bodies for signed posts.
    if isinstance(evt, Message) and evt.post and evt.post_author:
        content.ensure_has_html()
        content.body += f"\n- {evt.post_author}"
        content.formatted_body += f"<br/><i>- <u>{evt.post_author}</u></i>"

    return content

예제 #4

0

파일 보기

def remove_code_and_mentions(message):
    """
    Return the message text with code and mention entities cut out.

    Offsets are applied on the surrogate-expanded text, and the result is
    converted back with ``del_surrogate`` before being returned.

    :param message: object exposing ``message`` (the text) and
        ``get_entities_text()``.
    :return: the text without the spans covered by ``MessageEntityCode``,
        ``MessageEntityMention`` or ``MessageEntityMentionName``.
    """
    units = list(add_surrogate(message.message))
    spans = [
        slice(entity.offset, entity.offset + entity.length)
        for entity, _ in message.get_entities_text()
        if isinstance(entity, (types.MessageEntityCode,
                               types.MessageEntityMention,
                               types.MessageEntityMentionName))
    ]
    # Delete back to front, in reverse of the order the entities came in.
    for span in spans[::-1]:
        del units[span]
    return del_surrogate(''.join(units))

예제 #5

0

파일 보기

파일: __init__.py 프로젝트: tulir/mautrix-telegram

async def _matrix_html_to_telegram(
        client: TelegramClient,
        html: str) -> tuple[str, list[TypeMessageEntity]]:
    """
    Convert Matrix HTML into Telegram text plus message entities.

    :param client: handed to ``MatrixParser``.
    :param html: the Matrix-formatted HTML to convert.
    :return: ``(text, entities)`` as produced by ``_cut_long_message``.
    :raises FormatError: if any step of the conversion fails; the original
        exception is chained as the cause.
    """
    try:
        # Pre-process: wrap commands, expand tabs, then apply the
        # not-command substitution.
        html = command_regex.sub(r"<command>\1</command>", html)
        html = html.replace("\t", " " * 4)
        html = not_command_regex.sub(r"\1", html)

        # The parser works on the surrogate-expanded HTML.
        parser = MatrixParser(client)
        parsed = await parser.parse(add_surrogate(html))
        stripped = del_surrogate(parsed.text.strip())
        message, message_entities = _cut_long_message(
            stripped, parsed.telegram_entities)
    except Exception as e:
        raise FormatError(f"Failed to convert Matrix format: {html}") from e
    else:
        return message, message_entities

예제 #6

0

파일 보기

파일: __init__.py 프로젝트: yncyrydybyl/mautrix-telegram

def matrix_to_telegram(html: str) -> ParsedMessage:
    """
    Convert Matrix HTML into Telegram text plus message entities.

    :param html: the Matrix-formatted HTML to convert.
    :return: ``(text, entities)`` as produced by ``cut_long_message``.
    :raises FormatError: if any step of the conversion fails; the original
        exception is chained as the cause.
    """
    try:
        # Pre-process: wrap commands, expand tabs, then apply the
        # not-command substitution.
        html = command_regex.sub(r"<command>\1</command>", html)
        html = html.replace("\t", " " * 4)
        html = not_command_regex.sub(r"\1", html)
        if should_bridge_plaintext_highlights:
            html = plain_mention_regex.sub(plain_mention_to_html, html)

        # The HTML parser works on the surrogate-expanded input.
        raw_text, raw_entities = parse_html(add_surrogate(html))
        stripped = del_surrogate(raw_text.strip())
        message, message_entities = cut_long_message(stripped, raw_entities)
    except Exception as e:
        raise FormatError(f"Failed to convert Matrix format: {html}") from e
    else:
        return message, message_entities

예제 #7

0

파일 보기

async def _hacky_find_mention(
        evt: CommandEvent) -> TypeInputUser | TypeInputPeer | None:
    """
    Resolve the first user mention found in the command's message.

    :param evt: the command event whose body/formatted body is inspected.
    :return: the input entity for a username mention, the ``user_id`` of an
        ``InputMessageEntityMentionName``, or ``None`` when the command has
        no arguments or no such entity is present.
    """
    if len(evt.args) == 0:
        return None

    client = evt.sender.client
    text, entities = await fmt.matrix_to_telegram(
        client,
        text=evt.content.body,
        html=evt.content.formatted_body)

    for entity in entities:
        if isinstance(entity, MessageEntityMention):
            # Offsets are in UTF-16 code units; the +1 skips the first
            # character of the mention span.
            start = entity.offset + 1
            stop = entity.offset + entity.length
            admin_username = add_surrogate(text)[start:stop]
            return await evt.sender.client.get_input_entity(admin_username)
        elif isinstance(entity, InputMessageEntityMentionName):
            return entity.user_id
    return None

예제 #8

0

파일 보기

async def telegram_to_matrix(
        evt: Message,
        source: "AbstractUser",
        main_intent: Optional[IntentAPI] = None,
        prefix_text: Optional[str] = None,
        prefix_html: Optional[str] = None,
        override_text: str = None,
        override_entities: List[TypeMessageEntity] = None,
        no_reply_fallback: bool = False) -> TextMessageEventContent:
    """
    Build the Matrix text event content for a Telegram message.

    The body is kept in surrogate-expanded form while it is being
    assembled, and both bodies are converted back with ``del_surrogate``
    right before returning.

    :param evt: the Telegram message to convert.
    :param source: passed through to the forward/reply header helpers.
    :param main_intent: passed through to the reply header helper.
    :param prefix_text: text prepended to the plain body, if given.
    :param prefix_html: HTML prepended to the formatted body, if given.
    :param override_text: used instead of ``evt.message`` when truthy.
    :param override_entities: used instead of ``evt.entities`` when truthy.
    :param no_reply_fallback: skip the reply header even for replies.
    :return: the populated ``TextMessageEventContent``.
    """
    surrogated_body = add_surrogate(override_text or evt.message)
    content = TextMessageEventContent(msgtype=MessageType.TEXT,
                                      body=surrogated_body)

    entities = override_entities or evt.entities
    if entities:
        content.format = Format.HTML
        content.formatted_body = _telegram_entities_to_matrix_catch(
            content.body, entities)

    if prefix_html:
        # Without entities there is no HTML yet; derive it from the body.
        if not content.formatted_body:
            content.format = Format.HTML
            content.formatted_body = escape(content.body)
        content.formatted_body = prefix_html + content.formatted_body
    if prefix_text:
        content.body = prefix_text + content.body

    if evt.fwd_from:
        await _add_forward_header(source, content, evt.fwd_from)

    wants_reply_header = evt.reply_to_msg_id and not no_reply_fallback
    if wants_reply_header:
        await _add_reply_header(source, content, evt, main_intent)

    # Append the author signature to both bodies for signed posts.
    if isinstance(evt, Message) and evt.post and evt.post_author:
        if not content.formatted_body:
            content.formatted_body = escape(content.body)
        content.body += f"\n- {evt.post_author}"
        content.formatted_body += f"<br/><i>- <u>{evt.post_author}</u></i>"

    content.body = del_surrogate(content.body)

    if content.formatted_body:
        with_breaks = content.formatted_body.replace("\n", "<br/>")
        content.formatted_body = del_surrogate(with_breaks)

    return content

예제 #9

0

파일 보기

def parse(message, delimiters=None, url_re=None):
    """
    Parses the given markdown message and returns its stripped representation
    plus a list of the MessageEntity's that were found.

    :param message: the message with markdown-like syntax to be parsed.
    :param delimiters: the delimiters to be used, {delimiter: type}.
        ``None`` selects ``DEFAULT_DELIMITERS``; any other falsy value
        returns the message untouched with no entities.
    :param url_re: the URL regex to be used, either a pattern string or a
        compiled pattern. Must have two groups (link text and URL).
        ``None`` selects ``DEFAULT_URL_RE``.
    :return: a tuple consisting of (clean message, [message entities]).
    """
    if not message:
        return message, []

    if url_re is None:
        url_re = DEFAULT_URL_RE
    elif isinstance(url_re, str):
        url_re = re.compile(url_re)

    if not delimiters:
        if delimiters is not None:
            return message, []
        delimiters = DEFAULT_DELIMITERS

    # Build a regex to efficiently test all delimiters at once.
    # Note that the largest delimiter should go first, we don't
    # want ``` to be interpreted as a single back-tick in a code block.
    delim_re = re.compile('|'.join(
        '({})'.format(re.escape(k))
        for k in sorted(delimiters, key=len, reverse=True)))

    # Cannot use a for loop because we need to skip some indices
    i = 0
    result = []

    # Work on the surrogate-expanded text so that string indices match the
    # UTF-16 code unit offsets used by the entities.
    message = add_surrogate(message)
    while i < len(message):
        m = delim_re.match(message, pos=i)

        # Did we find some delimiter here at `i`?
        if m:
            delim = next(filter(None, m.groups()))

            # +1 to avoid matching right after (e.g. "****")
            end = message.find(delim, i + len(delim) + 1)

            # Did we find the earliest closing tag?
            if end != -1:

                # Remove the delimiter from the string
                message = ''.join((message[:i], message[i + len(delim):end],
                                   message[end + len(delim):]))

                # Check other affected entities
                for ent in result:
                    # If the end is after our start, it is affected
                    if ent.offset + ent.length > i:
                        # If the old start is also before ours, it is fully enclosed
                        if ent.offset <= i:
                            ent.length -= len(delim) * 2
                        else:
                            ent.length -= len(delim)

                # Append the found entity
                ent = delimiters[delim]
                if ent == MessageEntityPre:
                    result.append(ent(i, end - i - len(delim),
                                      ''))  # has 'lang'
                else:
                    result.append(ent(i, end - i - len(delim)))

                # No nested entities inside code blocks
                if ent in (MessageEntityCode, MessageEntityPre):
                    i = end - len(delim)

                continue

        elif url_re:
            m = url_re.match(message, pos=i)
            if m:
                # Replace the whole match with only the inline URL text.
                message = ''.join(
                    (message[:m.start()], m.group(1), message[m.end():]))

                # Number of characters removed from the message: the whole
                # match minus the link text that was kept. Subtracting
                # len(m.group()) here would always yield zero and leave
                # enclosing entities too long.
                delim_size = m.end() - m.start() - len(m.group(1))
                for ent in result:
                    # If the end is after our start, it is affected
                    if ent.offset + ent.length > m.start():
                        ent.length -= delim_size

                result.append(
                    MessageEntityTextUrl(offset=m.start(),
                                         length=len(m.group(1)),
                                         url=del_surrogate(m.group(2))))
                # Skip over the link text that was just kept.
                i += len(m.group(1))
                continue

        i += 1

    message = strip_text(message, result)
    return del_surrogate(message), result

예제 #10

0

파일 보기

 def code_converter(content):
     """
     Wrap the whole (stripped) content in a single code entity.

     :param content: the text to convert.
     :return: ``(stripped content, [MessageEntityCode])`` where the entity
         starts at offset 0 and its length is measured on the
         surrogate-expanded text.
     """
     stripped = content.strip()
     entity_length = len(add_surrogate(stripped))
     code_entity = types.MessageEntityCode(offset=0, length=entity_length)
     return stripped, [code_entity]

예제 #11

0

파일 보기

파일: utils.py 프로젝트: Rongronggg9/RSS-to-Telegram-Bot

def surrogate_len(s: str) -> int:
    """Return the length of ``s`` after expanding it with ``add_surrogate``."""
    expanded = add_surrogate(s)
    return len(expanded)

예제 #12

0

파일 보기

    def _process_text(self, params):
        """
        Fill ``params`` with the message text derived from ``self.source``.

        Side effects visible in this method: may append an entity to
        ``self.source.entities``, extend ``self.source.text``, remove the
        ``Fwd`` and ``Geo`` attachments from ``self.attachments``, and set
        the ``'message'``, ``'lat'`` and ``'long'`` keys of ``params``.

        :param params: mutable mapping that receives the outgoing fields.
        :return: ``False`` when the source has no text or when a ``Page``
            attachment is present (its title becomes the message);
            ``True`` otherwise.
        """
        if not self.source.text:
            return False

        # Only append the forward URL when forwarding mode is FWD_APPEND.
        append_from = self.fwd == FWD_APPEND
        fwd = None
        # Attachments are walked from last to first.
        for att in reversed(self.attachments):
            if append_from and isinstance(att, Fwd):
                # NOTE(review): presumably returns the first Fwd found in the
                # reversed list, i.e. `att` itself — confirm in _type_in_list.
                fwd = _type_in_list(reversed(self.attachments), Fwd)

                # Fwd.url is already resolved here
                # NOTE(review): offset/length are computed with plain len(),
                # not on the surrogate-expanded text, and the length is
                # len(fwd.url) - 2 — verify both against the intended entity.
                self.source.entities.append(
                    types.MessageEntityUrl(
                        len(self.source.raw_text) + 2,
                        len(fwd.url) - 2))
                self.source.text += '\n\n' + fwd.url
                # Handle at most one Fwd attachment.
                append_from = False
                continue

            if not isinstance(att, Url):
                continue

            # The whole text is just this URL: use the attachment title as
            # the message (when it has one) and stop processing.
            if self.source.text == str(att.url):
                if att.title:
                    params['message'] = att.title
                return True

        if fwd:
            self.attachments.remove(fwd)

        # Collect the entities that will be rendered as "title (url)".
        text_urls = []
        for e, inner_text in self.source.get_entities_text():
            # NOTE no MessageEntityMentionName usage examples/documentation available
            # so assume it is same as MessageEntityMention
            if isinstance(
                    e,
                (types.MessageEntityMention, types.MessageEntityMentionName)):
                # inner_text[1:] drops the first character of the mention
                # text before building the t.me link.
                text_urls.append(
                    types.MessageEntityTextUrl(
                        e.offset, e.length, 'https://t.me/' + inner_text[1:]))
                continue
            if isinstance(e, types.MessageEntityTextUrl):
                text_urls.append(e)

        # A Geo attachment is consumed here and turned into coordinates.
        geo = _type_in_list(self.attachments, Geo)
        if geo:
            self.attachments.remove(geo)
            params['lat'] = geo.lat
            params['long'] = geo.long

        # if this is a rich text
        rich_page = _type_in_list(self.attachments, Page)
        if rich_page:
            params['message'] = rich_page.title
            return False

        if text_urls:
            # add_surrogate/del_surrogate are used by Telethon internally in
            # get_entities_text -> get_inner_text to get correct offsets in unicode
            raw_text = add_surrogate(self.source.raw_text)

            msg = []
            # `prev` is the end of the last emitted span.
            prev = 0
            for tu in text_urls:
                # Everything from the previous span end up to the end of this
                # entity, i.e. preceding plain text plus the link text.
                title = del_surrogate(raw_text[prev:(tu.offset + tu.length)])
                # link titles to telegraph photos look like \u200b\u200b
                # Skipped entities leave `prev` untouched, so their span is
                # carried over into the next emitted chunk.
                if _ZERO_CHARS.match(title):
                    continue
                msg.append(title)
                msg.append(' (' + tu.url + ') ')
                prev = tu.offset + tu.length
            # Remaining text after the last emitted entity.
            msg.append(del_surrogate(raw_text[prev:]))
            del raw_text

            params['message'] = ''.join(msg)
        else:
            params['message'] = self.source.raw_text

        return True

예제 #13

0

파일 보기

    def _process_rich_text(self):
        """
        Turn a long, formatted source message into a ``Page`` attachment.

        The message counts as rich when it has at least one bold, italic,
        pre or code entity and its raw text is at least
        ``rich_text_min_length`` characters long (``[xpost]`` config section,
        default 256). In that case a title is cut from the start of the
        text, the entities are rendered with the templates in ``fmt_list``,
        and a ``Page`` is appended to ``self.attachments``.

        :return: ``True`` when a ``Page`` was appended, ``False`` when the
            message is not rich.
        """
        min_length = config.getint('xpost',
                                   'rich_text_min_length',
                                   fallback=256)
        # Rich = has entities, at least one of them is a formatting entity,
        # and the text is long enough.
        is_rich = bool(self.source.entities) and \
            bool(_type_in_list(self.source.entities, (
                types.MessageEntityBold, types.MessageEntityItalic,
                types.MessageEntityPre, types.MessageEntityCode
                ))) and \
            len(self.source.raw_text) >= min_length

        if not is_rich:
            return False

        min_title_length = config.getint('xpost',
                                         'min_title_length',
                                         fallback=8)
        max_title_length = min_length // 4
        # Title cut position: the smallest of the first newline, '. ' or
        # ', ' found in [min_title_length, max_title_length), with
        # max_title_length - 3 as the candidate that is always available
        # (find() results of -1 are filtered out).
        pos = min(p for p in (self.source.raw_text.find(
            '\n', min_title_length, max_title_length),
                              self.source.raw_text.find(
                                  '. ', min_title_length, max_title_length),
                              self.source.raw_text.find(
                                  ', ', min_title_length, max_title_length),
                              max_title_length - 3) if p != -1)

        title = self.source.raw_text[0:pos].strip() + '...'

        # Output template per entity type: {0} is the entity's inner text,
        # {1} is the extra value (`ev`) computed in the loop below.
        fmt_list = {
            types.MessageEntityBold: '<b>{0}</b>',
            types.MessageEntityItalic: '<i>{0}</i>',
            types.MessageEntityPre: '<pre>{0}</pre>',
            types.MessageEntityCode: '<code>{0}</code>',
            types.MessageEntityMention: '[https://t.me/{1}|{0}]',
            types.MessageEntityMentionName: '[https://t.me/{1}|{0}]',
            types.MessageEntityUrl: '[{0}]',
            types.MessageEntityTextUrl: '[{1}|{0}]'
        }

        # add_surrogate/del_surrogate are used by Telethon internally in
        # get_entities_text -> get_inner_text to get correct offsets in unicode
        raw_text = add_surrogate(self.source.raw_text)
        text = []
        # `prev` is the end offset of the previously rendered entity.
        prev = 0
        for e, et in self.source.get_entities_text():
            # Plain text between the previous entity and this one.
            text.append(del_surrogate(raw_text[prev:e.offset]))
            ev = None
            # NOTE no MessageEntityMentionName usage examples/documentation available
            # so assume it is same as MessageEntityMention
            if isinstance(
                    e,
                (types.MessageEntityMention, types.MessageEntityMentionName)):
                # Drop the first character of the mention text.
                ev = et[1:]
            elif isinstance(e, types.MessageEntityTextUrl):
                ev = e.url
            # Entity types without a template are emitted as their bare text.
            fmt = fmt_list.get(type(e), '{0}')
            text.append(del_surrogate(fmt.format(et, ev)))
            prev = e.offset + e.length
        # Remaining text after the last entity.
        text.append(del_surrogate(raw_text[prev:]))
        del raw_text

        # NOTE(review): the Page receives self.attachments itself and is then
        # appended to that same list — confirm this is intended.
        self.attachments.append(
            Page(self.session, self.default_params, self.group_id, title,
                 ''.join(text), self.attachments))
        return True

예제 #14

0

파일 보기

파일: tl.py 프로젝트: zeuslord224/uniborg

    def _calc_token(self, text):
        """
        Original code by ultrafunkamsterdam/googletranslate:
        https://github.com/ultrafunkamsterdam/googletranslate/blob/bd3f4d0a1386ffa634c8ebbebb3603279f3ece99/googletranslate/__init__.py#L263

        If this ever breaks, the way it was found was in one of the top-100
        longest lines of `translate_m.js` used by translate.google.com, it
        uses a single-line with all these "magic" values and one can look
        around there and use a debugger to figure out how it works. It's
        a very straight-forward port.

        :param text: the text the token is computed for.
        :return: a string of the form ``'<a>.<a ^ b>'`` where ``b`` is
            ``self._tkk[0]`` and ``a`` is the hash reduced modulo 1000000.
        """
        def xor_rot(a, b):
            # `b` is consumed three characters at a time: an outer operator,
            # a shift direction and a shift amount.
            size_b = len(b)
            c = 0
            while c < size_b - 2:
                d = b[c + 2]
                # Shift amount: letters map to ord - 87 ('a' -> 10),
                # digits to their integer value.
                d = ord(d[0]) - 87 if 'a' <= d else int(d)
                # '+' as direction: right shift of the value reduced modulo
                # 2**32; anything else: left shift.
                d = (a % 0x100000000) >> d if '+' == b[c + 1] else a << d
                # '+' as operator: add and mask to 32 bits (the addition
                # binds tighter than &); anything else: xor.
                a = a + d & 4294967295 if '+' == b[c] else a ^ d
                c += 3
            return a

        # Collect the code units of the text into `a`.
        a = []
        text = helpers.add_surrogate(text)
        for i in text:
            val = ord(i)
            if val < 0x10000:
                a += [val]
            else:
                # NOTE(review): after add_surrogate every character is
                # presumably below 0x10000, which would make this manual
                # surrogate split unreachable — confirm before removing.
                a += [
                    math.floor((val - 0x10000) / 0x400 + 0xD800),
                    math.floor((val - 0x10000) % 0x400 + 0xDC00),
                ]

        # `self._tkk` is indexed as a pair: [0] seeds the hash and is used
        # in the result, [1] is xor-ed in at the end.
        d = self._tkk
        b = d[0]
        # Expand the code units into byte values in `e`; the branch layout
        # mirrors UTF-8 encoding (1, 2, 3 or 4 bytes per code point).
        e = []
        g = 0
        size = len(text)
        while g < size:
            l = a[g]
            if l < 128:
                e.append(l)
            else:
                if l < 2048:
                    e.append(l >> 6 | 192)
                else:
                    # 64512 == 0xFC00, 55296 == 0xD800, 56320 == 0xDC00:
                    # a high surrogate followed by a low surrogate is merged
                    # into one code point and emitted as four bytes.
                    if ((l & 64512) == 55296 and g + 1 < size
                            and a[g + 1] & 64512 == 56320):
                        g += 1
                        l = 65536 + ((l & 1023) << 10) + (a[g] & 1023)
                        e.append(l >> 18 | 240)
                        e.append(l >> 12 & 63 | 128)
                    else:
                        e.append(l >> 12 | 224)
                    e.append(l >> 6 & 63 | 128)
                e.append(l & 63 | 128)
            g += 1
        # Fold every byte into the running value, starting from the seed.
        a = b
        for i, value in enumerate(e):
            a += value
            a = xor_rot(a, '+-a^+6')
        a = xor_rot(a, '+-3^+b+-f')
        a ^= d[1]
        # Map a negative value into the unsigned 32-bit range.
        if a < 0:
            a = (a & 2147483647) + 2147483648
        a %= 1000000
        return '{}.{}'.format(a, a ^ b)