Exemplo n.º 1
0
async def telegram_to_matrix(
        evt: Message,
        source: "AbstractUser",
        main_intent: Optional[IntentAPI] = None,
        prefix_text: Optional[str] = None,
        prefix_html: Optional[str] = None,
        override_text: Optional[str] = None,
        override_entities: Optional[List[TypeMessageEntity]] = None,
        no_reply_fallback: bool = False) -> TextMessageEventContent:
    """
    Convert a Telegram message into Matrix text message content.

    :param evt: the Telegram message to convert.
    :param source: passed through to the forward/reply header helpers.
    :param main_intent: passed through to the reply header helper.
    :param prefix_text: text prepended to the plain-text body.
    :param prefix_html: HTML prepended to the formatted body. If there is no
                        formatted body yet, one is created from the escaped
                        plain body first.
    :param override_text: used instead of ``evt.message`` when truthy.
    :param override_entities: used instead of ``evt.entities`` when truthy.
    :param no_reply_fallback: when true, no reply header is added even if
                              the message is a reply.
    :return: the content, with ``body`` and ``formatted_body`` (if any)
             passed through ``del_surrogate``.
    """
    # The body is kept in add_surrogate() form until the very end;
    # presumably so that the Telegram entity offsets index into it directly.
    content = TextMessageEventContent(
        msgtype=MessageType.TEXT,
        body=add_surrogate(override_text or evt.message),
    )
    entities = override_entities or evt.entities
    if entities:
        content.format = Format.HTML
        content.formatted_body = _telegram_entities_to_matrix_catch(
            content.body, entities)

    if prefix_html:
        if not content.formatted_body:
            content.format = Format.HTML
            content.formatted_body = escape(content.body)
        content.formatted_body = prefix_html + content.formatted_body
    if prefix_text:
        content.body = prefix_text + content.body

    if evt.fwd_from:
        await _add_forward_header(source, content, evt.fwd_from)

    if evt.reply_to_msg_id and not no_reply_fallback:
        await _add_reply_header(source, content, evt, main_intent)

    if isinstance(evt, Message) and evt.post and evt.post_author:
        if not content.formatted_body:
            # A formatted body without a format is not valid HTML content,
            # so mark it the same way the prefix_html branch does.
            content.format = Format.HTML
            content.formatted_body = escape(content.body)
        content.body += f"\n- {evt.post_author}"
        # The signature is user-controlled text: escape it before it is
        # embedded in the HTML body.
        content.formatted_body += (
            f"<br/><i>- <u>{escape(evt.post_author)}</u></i>")

    content.body = del_surrogate(content.body)

    if content.formatted_body:
        content.formatted_body = del_surrogate(
            content.formatted_body.replace("\n", "<br/>"))

    return content
Exemplo n.º 2
0
def unparse(text, entities, delimiters=None, url_fmt=None):
    """
    Performs the reverse operation to .parse(), effectively returning
    markdown-like syntax given a normal text and its MessageEntity's.

    :param text: the text to be reconverted into markdown.
    :param entities: the MessageEntity's applied to the text. A single
                     TLObject is accepted and treated as a one-item tuple.
    :param delimiters: the delimiters to use, {delimiter: type}. ``None``
                       selects DEFAULT_DELIMITERS; any other falsy value
                       (e.g. ``{}``) returns the text untouched.
    :param url_fmt: deprecated and ignored; passing it only emits a warning.
    :return: a markdown-like text representing the combination of both inputs.
    """
    if not text or not entities:
        return text

    if not delimiters:
        if delimiters is not None:
            return text
        delimiters = DEFAULT_DELIMITERS

    if url_fmt is not None:
        warnings.warn(
            'url_fmt is deprecated')  # since it complicates everything *a lot*

    if isinstance(entities, TLObject):
        entities = (entities, )

    text = add_surrogate(text)
    # Invert the mapping so the delimiter can be looked up by entity type.
    delimiters = {v: k for k, v in delimiters.items()}
    insert_at = []
    for entity in entities:
        s = entity.offset
        e = entity.offset + entity.length
        delimiter = delimiters.get(type(entity), None)
        if delimiter:
            insert_at.append((s, delimiter))
            insert_at.append((e, delimiter))
        else:
            url = None
            if isinstance(entity, MessageEntityTextUrl):
                url = entity.url
            elif isinstance(entity, MessageEntityMentionName):
                url = 'tg://user?id={}'.format(entity.user_id)
            if url:
                insert_at.append((s, '['))
                insert_at.append((e, ']({})'.format(url)))

    # Insert from the end of the text backwards so that earlier positions
    # are not shifted by the insertions already made.
    insert_at.sort(key=lambda t: t[0])
    while insert_at:
        at, what = insert_at.pop()

        # If we are in the middle of a surrogate pair nudge the position by
        # +1, otherwise we would end up with malformed text and fail to
        # encode. For example of bad input: "Hi \ud83d\ude1c"
        # https://en.wikipedia.org/wiki/UTF-16#U+010000_to_U+10FFFF
        #
        # Only a position *between* a high and a low surrogate is "in the
        # middle". A position right before a high surrogate is a valid
        # character boundary and must not be moved, or the delimiter would
        # land after the character it is supposed to precede.
        if (0 < at < len(text)
                and '\ud800' <= text[at - 1] <= '\udbff'
                and '\udc00' <= text[at] <= '\udfff'):
            at += 1

        text = text[:at] + what + text[at:]

    return del_surrogate(text)
Exemplo n.º 3
0
def remove_code_and_mentions(message):
    """
    Return the message text with code and mention entities cut out.

    The text is handled as a list of characters in ``add_surrogate`` form,
    the span of every code / mention / named-mention entity is deleted, and
    the remainder is joined and passed through ``del_surrogate``.

    :param message: object exposing ``message`` (the text) and
                    ``get_entities_text()``.
    :return: the text without the removed spans.
    """
    chars = list(add_surrogate(message.message))
    removable = (types.MessageEntityCode, types.MessageEntityMention,
                 types.MessageEntityMentionName)
    spans = [
        (entity.offset, entity.offset + entity.length)
        for entity, _ in message.get_entities_text()
        if isinstance(entity, removable)
    ]
    # Delete from the last collected span to the first so that the earlier
    # offsets stay valid while the list shrinks.
    for start, stop in reversed(spans):
        del chars[start:stop]
    return del_surrogate(''.join(chars))
Exemplo n.º 4
0
async def _matrix_html_to_telegram(
        client: TelegramClient,
        html: str) -> tuple[str, list[TypeMessageEntity]]:
    """
    Convert Matrix HTML into Telegram text plus entities.

    The HTML is preprocessed with ``command_regex`` / ``not_command_regex``
    and has tabs expanded to four spaces, then it is parsed with
    ``MatrixParser`` and the result is passed through ``_cut_long_message``.

    :param client: handed to ``MatrixParser``.
    :param html: the Matrix HTML to convert.
    :return: ``(text, entities)``.
    :raises FormatError: if any step fails; the original exception is chained.
    """
    try:
        # ``html`` is rebound step by step on purpose: the error message
        # below reports the markup as it stood when the failure happened.
        html = command_regex.sub(r"<command>\1</command>", html)
        html = html.replace("\t", "    ")
        html = not_command_regex.sub(r"\1", html)

        parser = MatrixParser(client)
        parsed = await parser.parse(add_surrogate(html))
        stripped = del_surrogate(parsed.text.strip())
        text, entities = _cut_long_message(stripped,
                                           parsed.telegram_entities)
    except Exception as err:
        raise FormatError(f"Failed to convert Matrix format: {html}") from err
    return text, entities
Exemplo n.º 5
0
def matrix_to_telegram(html: str) -> ParsedMessage:
    """
    Convert Matrix HTML into Telegram text plus entities.

    The HTML is preprocessed with ``command_regex`` / ``not_command_regex``,
    has tabs expanded to four spaces and, when
    ``should_bridge_plaintext_highlights`` is set, has plain mentions
    rewritten through ``plain_mention_regex``. It is then parsed with
    ``parse_html`` and passed through ``cut_long_message``.

    :param html: the Matrix HTML to convert.
    :return: ``(text, entities)``.
    :raises FormatError: if any step fails; the original exception is chained.
    """
    try:
        # ``html`` is rebound step by step on purpose: the error message
        # below reports the markup as it stood when the failure happened.
        html = command_regex.sub(r"<command>\1</command>", html)
        html = html.replace("\t", "    ")
        html = not_command_regex.sub(r"\1", html)
        if should_bridge_plaintext_highlights:
            html = plain_mention_regex.sub(plain_mention_to_html, html)

        raw_text, raw_entities = parse_html(add_surrogate(html))
        stripped = del_surrogate(raw_text.strip())
        text, entities = cut_long_message(stripped, raw_entities)
    except Exception as err:
        raise FormatError(f"Failed to convert Matrix format: {html}") from err
    return text, entities
Exemplo n.º 6
0
async def telegram_to_matrix(
    evt: Message | SponsoredMessage,
    source: au.AbstractUser,
    main_intent: IntentAPI | None = None,
    prefix_text: str | None = None,
    prefix_html: str | None = None,
    override_text: str | None = None,
    override_entities: list[TypeMessageEntity] | None = None,
    no_reply_fallback: bool = False,
    require_html: bool = False,
) -> TextMessageEventContent:
    """
    Convert a Telegram message into Matrix text message content.

    :param evt: the Telegram message (or sponsored message) to convert.
    :param source: passed through to the forward/reply header helpers.
    :param main_intent: passed through to the reply header helper.
    :param prefix_text: text prepended to the plain-text body.
    :param prefix_html: HTML prepended to the formatted body (a formatted
                        body is created first if there is none).
    :param override_text: used instead of ``evt.message`` when truthy.
    :param override_entities: used instead of ``evt.entities`` when truthy.
    :param no_reply_fallback: when true, no reply header is added even if
                              the event has ``reply_to`` set.
    :param require_html: when true, the content always gets a formatted body.
    :return: the assembled content.
    """
    # Local import: only the post-author branch needs it, and this keeps the
    # block independent of the file's import section.
    from html import escape

    # Keep the body as plain text. Nothing below converts the body back with
    # del_surrogate(), so storing it in add_surrogate() form would leak
    # surrogate code units into the returned body and into any HTML that
    # ensure_has_html() derives from it.
    content = TextMessageEventContent(
        msgtype=MessageType.TEXT,
        body=override_text or evt.message,
    )
    entities = override_entities or evt.entities
    if entities:
        content.format = Format.HTML
        # Only the entity formatter gets the add_surrogate() form of the
        # text; its output is converted back right away.
        formatted = await _telegram_entities_to_matrix_catch(
            add_surrogate(content.body), entities)
        content.formatted_body = del_surrogate(formatted)

    if require_html:
        content.ensure_has_html()

    if prefix_html:
        content.ensure_has_html()
        content.formatted_body = prefix_html + content.formatted_body
    if prefix_text:
        content.body = prefix_text + content.body

    # getattr: not every accepted event type is guaranteed to carry these
    # attributes.
    if getattr(evt, "fwd_from", None):
        await _add_forward_header(source, content, evt.fwd_from)

    if getattr(evt, "reply_to", None) and not no_reply_fallback:
        await _add_reply_header(source, content, evt, main_intent)

    if isinstance(evt, Message) and evt.post and evt.post_author:
        content.ensure_has_html()
        content.body += f"\n- {evt.post_author}"
        # The signature is user-controlled text: escape it before it is
        # embedded in the HTML body.
        content.formatted_body += (
            f"<br/><i>- <u>{escape(evt.post_author)}</u></i>"
        )

    return content
Exemplo n.º 7
0
def parse(message, delimiters=None, url_re=None):
    """
    Parses the given markdown message and returns its stripped representation
    plus a list of the MessageEntity's that were found.

    :param message: the message with markdown-like syntax to be parsed.
    :param delimiters: the delimiters to be used, {delimiter: type}. ``None``
                       selects DEFAULT_DELIMITERS; any other falsy value
                       (e.g. ``{}``) disables parsing.
    :param url_re: the URL regex to be used (a compiled pattern or a string
                   to compile). Must have two groups: the visible text and
                   the URL. ``None`` selects DEFAULT_URL_RE.
    :return: a tuple consisting of (clean message, [message entities]).
    """
    if not message:
        return message, []

    if url_re is None:
        url_re = DEFAULT_URL_RE
    elif isinstance(url_re, str):
        url_re = re.compile(url_re)

    if not delimiters:
        if delimiters is not None:
            return message, []
        delimiters = DEFAULT_DELIMITERS

    # Build a regex to efficiently test all delimiters at once.
    # Note that the largest delimiter should go first, we don't
    # want ``` to be interpreted as a single back-tick in a code block.
    delim_re = re.compile('|'.join(
        '({})'.format(re.escape(k))
        for k in sorted(delimiters, key=len, reverse=True)))

    # Cannot use a for loop because we need to skip some indices
    i = 0
    result = []

    # Work on the add_surrogate() form of the text so that string indices
    # can be used directly as entity offsets and lengths.
    message = add_surrogate(message)
    while i < len(message):
        m = delim_re.match(message, pos=i)

        # Did we find some delimiter here at `i`?
        if m:
            delim = next(filter(None, m.groups()))

            # +1 to avoid matching right after (e.g. "****")
            end = message.find(delim, i + len(delim) + 1)

            # Did we find the earliest closing tag?
            if end != -1:

                # Remove the delimiter from the string
                message = ''.join((message[:i], message[i + len(delim):end],
                                   message[end + len(delim):]))

                # Check other affected entities
                for ent in result:
                    # If the end is after our start, it is affected
                    if ent.offset + ent.length > i:
                        # If the old start is also before ours, it is fully enclosed
                        if ent.offset <= i:
                            ent.length -= len(delim) * 2
                        else:
                            ent.length -= len(delim)

                # Append the found entity
                ent = delimiters[delim]
                if ent == MessageEntityPre:
                    result.append(ent(i, end - i - len(delim),
                                      ''))  # has 'lang'
                else:
                    result.append(ent(i, end - i - len(delim)))

                # No nested entities inside code blocks
                if ent in (MessageEntityCode, MessageEntityPre):
                    i = end - len(delim)

                # `i` is otherwise left where it is so that the entity's
                # inner text is scanned for nested delimiters and URLs.
                continue

        elif url_re:
            m = url_re.match(message, pos=i)
            if m:
                # Replace the whole match with only the inline URL text.
                message = ''.join(
                    (message[:m.start()], m.group(1), message[m.end():]))

                # Number of characters that just disappeared: the whole
                # match minus the visible text that was kept. (Subtracting
                # the length of the whole match here would always give 0
                # and leave enclosing entities too long.)
                delim_size = m.end() - m.start() - len(m.group(1))
                for ent in result:
                    # If the end is after our start, it is affected
                    if ent.offset + ent.length > m.start():
                        ent.length -= delim_size

                result.append(
                    MessageEntityTextUrl(offset=m.start(),
                                         length=len(m.group(1)),
                                         url=del_surrogate(m.group(2))))
                i += len(m.group(1))
                continue

        i += 1

    message = strip_text(message, result)
    return del_surrogate(message), result
Exemplo n.º 8
0
    def _process_text(self, params):
        """
        Build the outgoing text for the source message and store it in
        ``params['message']``.

        Side effects visible in this method: may append a URL entity and the
        forward URL to ``self.source``, removes the handled ``Fwd`` and
        ``Geo`` attachments from ``self.attachments``, and fills
        ``params['lat']`` / ``params['long']`` from a ``Geo`` attachment.

        :param params: dict-like object that receives the request parameters.
        :return: False when the source has no text or when a ``Page``
                 attachment is present (then only its title is stored as the
                 message); True otherwise.
        """
        if not self.source.text:
            return False

        # Only the first Fwd attachment met (walking from the end) is
        # appended, and only when forwarding is configured as FWD_APPEND.
        append_from = self.fwd == FWD_APPEND
        fwd = None
        for att in reversed(self.attachments):
            if append_from and isinstance(att, Fwd):
                fwd = _type_in_list(reversed(self.attachments), Fwd)

                # Fwd.url is already resolved here
                # NOTE(review): the offset skips the two newlines added
                # below; the reason for `len(fwd.url) - 2` as the length is
                # not visible here - confirm.
                self.source.entities.append(
                    types.MessageEntityUrl(
                        len(self.source.raw_text) + 2,
                        len(fwd.url) - 2))
                self.source.text += '\n\n' + fwd.url
                append_from = False
                continue

            if not isinstance(att, Url):
                continue

            # The whole text is just this URL: use the attachment title (if
            # any) as the message and stop here. Note this returns before
            # any of the processing below.
            if self.source.text == str(att.url):
                if att.title:
                    params['message'] = att.title
                return True

        if fwd:
            self.attachments.remove(fwd)

        # Collect the entities that will be rendered as "text (url)" below;
        # mentions are turned into t.me links.
        text_urls = []
        for e, inner_text in self.source.get_entities_text():
            # NOTE no MessageEntityMentionName usage examples/documentation available
            # so assume it is same as MessageEntityMention
            if isinstance(
                    e,
                (types.MessageEntityMention, types.MessageEntityMentionName)):
                # inner_text[1:] drops the first character of the entity
                # text (presumably the leading '@').
                text_urls.append(
                    types.MessageEntityTextUrl(
                        e.offset, e.length, 'https://t.me/' + inner_text[1:]))
                continue
            if isinstance(e, types.MessageEntityTextUrl):
                text_urls.append(e)

        geo = _type_in_list(self.attachments, Geo)
        if geo:
            self.attachments.remove(geo)
            params['lat'] = geo.lat
            params['long'] = geo.long

        # if this is a rich text
        rich_page = _type_in_list(self.attachments, Page)
        if rich_page:
            params['message'] = rich_page.title
            return False

        if text_urls:
            # add_surrogate/del_surrogate are used by Telethon internally in
            # get_entities_text -> get_inner_text to get correct offsets in unicode
            raw_text = add_surrogate(self.source.raw_text)

            # Walk the text once, emitting each stretch up to the end of a
            # link entity followed by " (url) ".
            msg = []
            prev = 0
            for tu in text_urls:
                title = del_surrogate(raw_text[prev:(tu.offset + tu.length)])
                # link titles to telegraph photos look like \u200b\u200b
                # Skipping leaves `prev` untouched, so the skipped stretch
                # is included in the next chunk that is emitted.
                if _ZERO_CHARS.match(title):
                    continue
                msg.append(title)
                msg.append(' (' + tu.url + ') ')
                prev = tu.offset + tu.length
            msg.append(del_surrogate(raw_text[prev:]))
            del raw_text

            params['message'] = ''.join(msg)
        else:
            params['message'] = self.source.raw_text

        return True
Exemplo n.º 9
0
    def _process_rich_text(self):
        """
        Turn a long, formatted source message into a ``Page`` attachment.

        The message counts as rich when it has entities, at least one of
        them is bold / italic / pre / code, and its raw text is at least
        ``xpost.rich_text_min_length`` characters long (default 256).

        :return: False when the message is not rich; True after a ``Page``
                 has been appended to ``self.attachments``.
        """
        min_length = config.getint('xpost',
                                   'rich_text_min_length',
                                   fallback=256)

        # Guard clauses, checked in the same order as a short-circuit `and`.
        if not self.source.entities:
            return False
        rich_types = (types.MessageEntityBold, types.MessageEntityItalic,
                      types.MessageEntityPre, types.MessageEntityCode)
        if not _type_in_list(self.source.entities, rich_types):
            return False
        if len(self.source.raw_text) < min_length:
            return False

        # Title: cut at the earliest newline / sentence / clause break found
        # between the minimum and maximum title length, falling back to a
        # hard cut that leaves room for the ellipsis.
        min_title_length = config.getint('xpost',
                                         'min_title_length',
                                         fallback=8)
        max_title_length = min_length // 4
        source_text = self.source.raw_text
        cut_points = [
            source_text.find(separator, min_title_length, max_title_length)
            for separator in ('\n', '. ', ', ')
        ]
        cut_points.append(max_title_length - 3)
        pos = min(point for point in cut_points if point != -1)

        title = self.source.raw_text[0:pos].strip() + '...'

        # {0} is the entity's inner text, {1} the link target (if any).
        mention_template = '[https://t.me/{1}|{0}]'
        templates = {
            types.MessageEntityBold: '<b>{0}</b>',
            types.MessageEntityItalic: '<i>{0}</i>',
            types.MessageEntityPre: '<pre>{0}</pre>',
            types.MessageEntityCode: '<code>{0}</code>',
            types.MessageEntityMention: mention_template,
            types.MessageEntityMentionName: mention_template,
            types.MessageEntityUrl: '[{0}]',
            types.MessageEntityTextUrl: '[{1}|{0}]'
        }

        # add_surrogate/del_surrogate are used by Telethon internally in
        # get_entities_text -> get_inner_text to get correct offsets in unicode
        surrogate_text = add_surrogate(self.source.raw_text)
        pieces = []
        cursor = 0
        for entity, inner in self.source.get_entities_text():
            # Plain text between the previous entity and this one.
            pieces.append(del_surrogate(surrogate_text[cursor:entity.offset]))

            # NOTE no MessageEntityMentionName usage examples/documentation available
            # so assume it is same as MessageEntityMention
            if isinstance(entity, (types.MessageEntityMention,
                                   types.MessageEntityMentionName)):
                target = inner[1:]
            elif isinstance(entity, types.MessageEntityTextUrl):
                target = entity.url
            else:
                target = None

            template = templates.get(type(entity), '{0}')
            pieces.append(del_surrogate(template.format(inner, target)))
            cursor = entity.offset + entity.length
        pieces.append(del_surrogate(surrogate_text[cursor:]))
        del surrogate_text

        self.attachments.append(
            Page(self.session, self.default_params, self.group_id, title,
                 ''.join(pieces), self.attachments))
        return True