def soup2markup(self, soup: Any) -> List[Any]:
    """Translate parsed message HTML into a flat list of text markup.

    Each entry of the result is either a plain string or a
    ``(style_name, content)`` tuple; container elements are handled by
    recursing into their children.

    Args:
        soup: An iterable of parsed HTML nodes, or None. String nodes must
            be NavigableString instances; every other node is expected to
            expose ``name``, ``attrs``, ``text`` and ``img``.

    Returns:
        The markup list. It always starts with an empty string, so callers
        receive at least one string even when nothing is rendered.

    Side effects:
        If ``self.bq_len`` exists and is positive, each bare newline string
        is skipped and ``self.bq_len`` is decremented. Relative links are
        resolved against ``self.model.server_url``.
    """
    # Ensure a string is provided, in case the soup finds none
    # This could occur if eg. an image is removed or not shown
    markup = ['']  # type: List[Union[str, Tuple[Optional[str], Any]]]
    if soup is None:  # This is not iterable, so return promptly
        return markup

    unrendered_tags = {  # In pairs of 'tag_name': 'text'
        # TODO: Some of these could be implemented
        'br': '',  # No indicator of absence
        'hr': 'RULER',
        'img': 'IMAGE',
    }
    unrendered_div_classes = {  # In pairs of 'div_class': 'text'
        # TODO: Support embedded content & twitter preview?
        'message_embed': 'EMBEDDED CONTENT',
        'inline-preview-twitter': 'TWITTER PREVIEW',
        'message_inline_ref': '',  # Duplicate of other content
        'message_inline_image': '',  # Duplicate of other content
    }
    unrendered_template = '[{} NOT RENDERED]'

    for element in soup:
        if isinstance(element, NavigableString):
            # NORMAL STRINGS
            if (hasattr(self, 'bq_len') and element == '\n'
                    and self.bq_len > 0):
                self.bq_len -= 1
                continue
            markup.append(element)
            continue

        # Read the tag name and class list once instead of in every branch.
        name = element.name
        attrs = element.attrs or {}
        classes = attrs.get('class', [])
        # Pick the first unrendered class in the element's own class order,
        # so the result does not depend on set iteration order when a div
        # carries more than one of them.
        unrendered_class = next(
            (cls for cls in classes if cls in unrendered_div_classes), None)

        if name == 'div' and unrendered_class is not None:
            # UNRENDERED DIV CLASSES
            text = unrendered_div_classes[unrendered_class]
            if text:
                markup.append(unrendered_template.format(text))
        elif name == 'img' and classes == ['emoji']:
            # CUSTOM EMOJIS AND ZULIP_EXTRA_EMOJI
            # Default to an empty string: a missing title must not break
            # the string formatting below.
            emoji_name = attrs.get('title', '')
            markup.append(('msg_emoji', ':{}:'.format(emoji_name)))
        elif name in unrendered_tags:
            # UNRENDERED SIMPLE TAGS
            text = unrendered_tags[name]
            if text:
                markup.append(unrendered_template.format(text))
        elif name in ('p', 'ul', 'del'):
            # PARAGRAPH, LISTS, STRIKE-THROUGH
            markup.extend(self.soup2markup(element))
        elif name == 'span' and 'emoji' in classes:
            # EMOJI
            markup.append(('msg_emoji', element.text))
        elif name == 'span' and ('katex-display' in classes
                                 or 'katex' in classes):
            # MATH TEXT
            markup.append(element.text)
        elif name == 'span' and ('user-mention' in classes
                                 or 'user-group-mention' in classes):
            # USER MENTIONS & USER-GROUP MENTIONS
            markup.append(('msg_mention', element.text))
        elif name == 'a' and 'href' in attrs:
            # LINKS
            # Anchors without an href are not links; they fall through to
            # the generic branch at the end and only their content is shown.
            link = attrs['href']
            text = element.img['src'] if element.img else element.text

            parsed_link = urlparse(link)
            if not parsed_link.scheme:  # => relative link
                # Prepend org url to convert it to an absolute link
                link = urljoin(self.model.server_url, link)

            if link == text:
                # If the link and text are same
                # usually the case when user just pastes
                # a link then just display the link
                markup.append(('msg_link', text))
            else:
                markup.append(('msg_link', '[{}]({})'.format(text, link)))
        elif name == 'blockquote':
            # BLOCKQUOTE TEXT
            markup.append(('msg_quote', self.soup2markup(element)))
        elif name == 'code':
            # CODE (INLINE?)
            markup.append(('msg_code', element.text))
        elif name == 'div' and 'codehilite' in classes:
            # CODE (BLOCK?)
            markup.append(('msg_code', element.text))
        elif name in ('strong', 'em'):
            # BOLD & ITALIC
            markup.append(('msg_bold', element.text))
        elif name == 'li':
            # LISTS
            # TODO: Support nested lists
            markup.append(' * ')
            markup.extend(self.soup2markup(element))
        elif name == 'table':
            markup.extend(render_table(element))
        else:
            # Anything unrecognised: render whatever it contains.
            markup.extend(self.soup2markup(element))
    return markup
def soup2markup(self, soup: Any, **state: Any) -> List[Any]:
    """Translate parsed message HTML into a flat list of text markup.

    Each entry of the result is either a plain string or a
    ``(style_name, content)`` tuple; container elements are handled by
    recursing into their children.

    Args:
        soup: An iterable of parsed HTML nodes, or None. String nodes must
            be NavigableString instances; every other node is expected to
            expose ``name``, ``attrs``, ``text``, ``img`` and ``contents``.
        **state: List-rendering state carried through recursive calls:
            ``indent_level`` (nesting depth of the current list),
            ``list_start`` (True until the first item of the outermost
            list has been emitted) and ``list_index`` (next number to use;
            present only inside an ordered list).

    Returns:
        The markup list. It always starts with an empty string, so callers
        receive at least one string even when nothing is rendered.

    Side effects:
        If ``self.bq_len`` exists and is positive, each bare newline string
        is skipped and ``self.bq_len`` is decremented. Relative links are
        resolved against ``self.model.server_url``. Every link is recorded
        in ``self.message_links`` as
        ``link -> (text, index, show_footlink)``. Newline strings that are
        direct children of list and list-item elements are replaced with
        empty strings in the parsed tree.
    """
    # Ensure a string is provided, in case the soup finds none
    # This could occur if eg. an image is removed or not shown
    markup = ['']  # type: List[Union[str, Tuple[Optional[str], Any]]]
    if soup is None:  # This is not iterable, so return promptly
        return markup

    unrendered_tags = {  # In pairs of 'tag_name': 'text'
        # TODO: Some of these could be implemented
        'br': '',  # No indicator of absence
        'hr': 'RULER',
        'img': 'IMAGE',
    }
    unrendered_div_classes = {  # In pairs of 'div_class': 'text'
        # TODO: Support embedded content & twitter preview?
        'message_embed': 'EMBEDDED CONTENT',
        'inline-preview-twitter': 'TWITTER PREVIEW',
        'message_inline_ref': '',  # Duplicate of other content
        'message_inline_image': '',  # Duplicate of other content
    }
    unrendered_template = '[{} NOT RENDERED]'
    # Bullet used for unordered items, cycling with the nesting depth.
    bullets = (
        '\N{BULLET}',
        '\N{RING OPERATOR}',  # small hollow
        '\N{HYPHEN}',
    )

    for element in soup:
        if isinstance(element, NavigableString):
            # NORMAL STRINGS
            if (hasattr(self, 'bq_len') and element == '\n'
                    and self.bq_len > 0):
                self.bq_len -= 1
                continue
            markup.append(element)
            continue

        # Read the tag name and class list once instead of in every branch.
        name = element.name
        attrs = element.attrs or {}
        classes = attrs.get('class', [])
        # Pick the first unrendered class in the element's own class order,
        # so the result does not depend on set iteration order when a div
        # carries more than one of them.
        unrendered_class = next(
            (cls for cls in classes if cls in unrendered_div_classes), None)

        if name == 'div' and unrendered_class is not None:
            # UNRENDERED DIV CLASSES
            text = unrendered_div_classes[unrendered_class]
            if text:
                markup.append(unrendered_template.format(text))
        elif name == 'img' and classes == ['emoji']:
            # CUSTOM EMOJIS AND ZULIP_EXTRA_EMOJI
            # Default to an empty string: a missing title must not break
            # the string formatting below.
            emoji_name = attrs.get('title', '')
            markup.append(('msg_emoji', ':{}:'.format(emoji_name)))
        elif name in unrendered_tags:
            # UNRENDERED SIMPLE TAGS
            text = unrendered_tags[name]
            if text:
                markup.append(unrendered_template.format(text))
        elif name in ('p', 'del'):
            # PARAGRAPH, STRIKE-THROUGH
            markup.extend(self.soup2markup(element))
        elif name == 'span' and 'emoji' in classes:
            # EMOJI
            markup.append(('msg_emoji', element.text))
        elif name == 'span' and ('katex-display' in classes
                                 or 'katex' in classes):
            # MATH TEXT
            markup.append(element.text)
        elif name == 'span' and ('user-group-mention' in classes
                                 or 'user-mention' in classes):
            # USER MENTIONS & USER-GROUP MENTIONS
            markup.append(('msg_mention', element.text))
        elif name == 'a' and 'href' in attrs:
            # LINKS
            # Anchors without an href are not links; they fall through to
            # the generic branch at the end and only their content is shown.
            # Use rstrip to avoid anomalies and edge cases like
            # https://google.com vs https://google.com/.
            link = attrs['href'].rstrip('/')
            text = element.img['src'] if element.img else element.text
            text = text.rstrip('/')

            parsed_link = urlparse(link)
            if not parsed_link.scheme:  # => relative link
                # Prepend org url to convert it to an absolute link
                link = urljoin(self.model.server_url, link)

            text = text or link
            show_footlink = True

            # Only use the last segment if the text is redundant.
            # NOTE: The 'without scheme' excerpt is to deal with the case
            # where a user puts a link without any scheme and the server
            # uses http as the default scheme but keeps the text as-is.
            # For instance, see how example.com/some/path becomes
            # <a href="http://example.com">example.com/some/path</a>.
            # Split only on the first '://' so that a URL embedding a
            # second one (eg. in its query string) is not truncated.
            link_without_scheme, text_without_scheme = [
                data.split('://', 1)[1] if '://' in data else data
                for data in (link, text)
            ]  # Split on '://' is for cases where text == link.
            if link_without_scheme == text_without_scheme:
                segment = text.split('/')[-1]
                # Replace text with its last segment if the segment has
                # something significant than simply the 'domain name'.
                if segment != text_without_scheme:
                    text = segment
                else:
                    # Do not show as a footlink as the text is sufficient
                    # to represent the link.
                    show_footlink = False

            # Detect duplicate links to save screen real estate.
            if link not in self.message_links:
                self.message_links[link] = (
                    text, len(self.message_links) + 1, show_footlink)
            else:
                # Append the text if its link already exist with a
                # different text.
                saved_text, saved_link_index, saved_footlink_status = (
                    self.message_links[link])
                if saved_text != text:
                    self.message_links[link] = (
                        '{}, {}'.format(saved_text, text),
                        saved_link_index,
                        show_footlink or saved_footlink_status,
                    )

            markup.extend([
                ('msg_link', text),
                ' ',
                ('msg_link_index',
                 '[{}]'.format(self.message_links[link][1])),
            ])
        elif name == 'blockquote':
            # BLOCKQUOTE TEXT
            markup.append(('msg_quote', self.soup2markup(element)))
        elif name == 'code':
            # CODE (INLINE?)
            markup.append(('msg_code', element.text))
        elif name == 'div' and 'codehilite' in classes:
            # CODE (BLOCK?)
            markup.append(('msg_code', element.text))
        elif name in ('strong', 'em'):
            # BOLD & ITALIC
            markup.append(('msg_bold', element.text))
        elif name in ('ul', 'ol'):
            # LISTS (UL & OL)
            # Iterate over a snapshot: replace_with edits element.contents.
            for part in list(element.contents):
                if part == '\n':
                    part.replace_with('')

            # Build the state for this list separately instead of mutating
            # (and afterwards deleting from) the current state, so that a
            # later sibling list still sees the correct nesting depth.
            child_state = dict(state)
            if 'indent_level' in state:
                child_state['indent_level'] = state['indent_level'] + 1
                child_state['list_start'] = False
            else:
                child_state['indent_level'] = 1
                child_state['list_start'] = True
            if name == 'ol':
                child_state['list_index'] = int(attrs.get('start', 1))
            else:
                # this is unordered
                child_state.pop('list_index', None)
            markup.extend(self.soup2markup(element, **child_state))
        elif name == 'li':
            # LIST ITEMS (LI)
            for part in list(element.contents):
                if part == '\n':
                    part.replace_with('')

            # Every item except the very first starts on a new line.
            if not state.get('list_start', False):
                markup.append('\n')

            indent = state.get('indent_level', 1)
            if 'list_index' in state:
                markup.append('{}{}. '.format(' ' * indent,
                                              state['list_index']))
                # Mutate the shared state on purpose: the following
                # sibling items must continue the numbering.
                state['list_index'] += 1
            else:
                markup.append('{}{} '.format(' ' * indent,
                                             bullets[(indent - 1) % 3]))
            state['list_start'] = False
            markup.extend(self.soup2markup(element, **state))
        elif name == 'table':
            markup.extend(render_table(element))
        else:
            # Anything unrecognised: render whatever it contains.
            markup.extend(self.soup2markup(element))
    return markup