def __init__(self, features=()):
    """Set up the element-handler ruleset, then initialise the parent class.

    A ``BlockElementHandler('unstyled')`` is registered for ``p`` and kept
    on ``self.paragraph_handler``. For every entry in ``features`` the
    ``'contentstate'`` converter rule is looked up in ``feature_registry``;
    when one exists, its ``'from_database_format'`` rules are added to
    ``self.element_handlers``.
    """
    self.paragraph_handler = BlockElementHandler('unstyled')
    base_rules = {'p': self.paragraph_handler}
    self.element_handlers = HTMLRuleset(base_rules)

    for feature_name in features:
        converter_rule = feature_registry.get_converter_rule('contentstate', feature_name)
        if converter_rule is None:
            # no contentstate converter registered for this feature
            continue
        self.element_handlers.add_rules(converter_rule['from_database_format'])

    super().__init__()
def test_precedence(self):
    """An attribute-qualified rule is expected to win over bare-tag rules.

    Plain ``p`` rules are registered both before and after the
    ``p[class="intro"]`` rule; the match for a ``p`` carrying
    ``class="intro"`` must still be the attribute-qualified one.
    """
    ruleset = HTMLRuleset()
    rules_in_insertion_order = (
        ('p', 'normal-paragraph'),
        ('p[class="intro"]', 'intro-paragraph'),
        ('p', 'normal-paragraph-again'),
    )
    for selector, result in rules_in_insertion_order:
        ruleset.add_rule(selector, result)

    self.assertEqual(ruleset.match('p', {'class': 'intro'}), 'intro-paragraph')
def test_precedence(self):
    """Check that ``p[class="intro"]`` is matched in preference to plain ``p``.

    The plain ``p`` selector is added once before and once after the
    attribute selector, so the expected result does not follow from
    insertion order alone.
    """
    ruleset = HTMLRuleset()

    ruleset.add_rule("p", "normal-paragraph")
    ruleset.add_rule('p[class="intro"]', "intro-paragraph")
    ruleset.add_rule("p", "normal-paragraph-again")

    intro_attrs = {"class": "intro"}
    matched = ruleset.match("p", intro_attrs)
    self.assertEqual(matched, "intro-paragraph")
def __init__(self, features=()):
    """Build the tag-to-handler ruleset and initialise the parent class.

    ``p`` maps to a ``BlockElementHandler("unstyled")`` (also stored as
    ``self.paragraph_handler``) and ``br`` maps to a ``LineBreakHandler``.
    Each name in ``features`` may contribute further rules: its
    ``"contentstate"`` converter rule is fetched from ``feature_registry``
    and, if present, the rule's ``"from_database_format"`` entry is added.
    The parent initialiser is called last, with ``convert_charrefs=True``.
    """
    self.paragraph_handler = BlockElementHandler("unstyled")
    ruleset = HTMLRuleset(
        {
            "p": self.paragraph_handler,
            "br": LineBreakHandler(),
        }
    )
    self.element_handlers = ruleset

    for feature_name in features:
        converter = feature_registry.get_converter_rule("contentstate", feature_name)
        if converter is not None:
            ruleset.add_rules(converter["from_database_format"])

    super().__init__(convert_charrefs=True)
def test_html_ruleset(self):
    """Exercise ``HTMLRuleset.match`` with tag-only and attribute selectors.

    Covers a bare tag (``p``), attribute presence (``a[href]``) and
    attribute equality written unquoted, double-quoted and single-quoted.
    Non-matching lookups are expected to return ``None``; these are checked
    with ``assertIsNone`` so that only the ``None`` singleton passes.
    """
    ruleset = HTMLRuleset({
        'p': 'paragraph',
        'a[href]': 'link',
        'a[linktype=page]': 'page-link',
        'a[linktype="silly page"]': 'silly-page-link',
        "a[linktype='sensible page']": 'sensible-page-link',
    })

    # unknown tag: no rule applies
    self.assertIsNone(ruleset.match('div', {}))

    # a bare tag selector matches whatever attributes are present
    self.assertEqual(ruleset.match('p', {}), 'paragraph')
    self.assertEqual(ruleset.match('p', {'class': 'intro'}), 'paragraph')

    # every 'a' rule requires an attribute, so a plain button link has no match
    self.assertIsNone(ruleset.match('a', {'class': 'button'}))
    self.assertEqual(
        ruleset.match('a', {'class': 'button', 'href': 'http://wagtail.io'}),
        'link',
    )

    # linktype rules match on the exact attribute value
    self.assertIsNone(ruleset.match('a', {'class': 'button', 'linktype': 'document'}))
    self.assertEqual(
        ruleset.match('a', {'class': 'button', 'linktype': 'page'}),
        'page-link',
    )
    self.assertEqual(
        ruleset.match('a', {'class': 'button', 'linktype': 'silly page'}),
        'silly-page-link',
    )
    self.assertEqual(
        ruleset.match('a', {'class': 'button', 'linktype': 'sensible page'}),
        'sensible-page-link',
    )
def __init__(self, features=()):
    """Prepare element handlers for the given features, then init the parent.

    The starting ruleset handles ``p`` (via ``self.paragraph_handler``, a
    ``BlockElementHandler('unstyled')``) and ``br`` (via a
    ``LineBreakHandler``). Features with a ``'contentstate'`` converter rule
    in ``feature_registry`` extend it with their ``'from_database_format'``
    rules; features without one are skipped.
    """
    paragraph_handler = BlockElementHandler('unstyled')
    self.paragraph_handler = paragraph_handler

    initial_rules = {
        'p': paragraph_handler,
        'br': LineBreakHandler(),
    }
    self.element_handlers = HTMLRuleset(initial_rules)

    for feature in features:
        rule = feature_registry.get_converter_rule('contentstate', feature)
        if rule is None:
            continue
        self.element_handlers.add_rules(rule['from_database_format'])

    super().__init__(convert_charrefs=True)
class HtmlToContentStateHandler(HTMLParser):
    """HTMLParser subclass that feeds parsed HTML into a ``ContentState``.

    Start and end tags are dispatched to per-element handlers chosen through
    an ``HTMLRuleset``; text nodes are whitespace-normalised and appended to
    the text of the current block.
    """

    def __init__(self, features=()):
        """Register the built-in ``p``/``br`` handlers plus feature rules."""
        self.paragraph_handler = BlockElementHandler('unstyled')
        builtin_rules = {
            'p': self.paragraph_handler,
            'br': LineBreakHandler(),
        }
        self.element_handlers = HTMLRuleset(builtin_rules)

        for feature_name in features:
            converter_rule = feature_registry.get_converter_rule('contentstate', feature_name)
            if converter_rule is None:
                continue
            self.element_handlers.add_rules(converter_rule['from_database_format'])

        super().__init__(convert_charrefs=True)

    def reset(self):
        """Start from a fresh state, content state and open-element stack."""
        self.state = HandlerState()
        self.contentstate = ContentState()

        # stack of (name, handler) tuples for the elements we're currently inside
        self.open_elements = []

        super().reset()

    def handle_starttag(self, name, attrs):
        """Push the element onto the stack and notify its handler, if any."""
        # the ruleset is matched against a dict; attrs arrives as (name, value) tuples
        attributes = dict(attrs)
        handler = self.element_handlers.match(name, attributes)

        at_top_level = not self.open_elements
        if handler is None and at_top_level:
            # treat unrecognised top-level elements as paragraphs
            handler = self.paragraph_handler

        self.open_elements.append((name, handler))

        if not handler:
            return
        # NOTE(review): the handler is given the original attrs list, not the dict
        handler.handle_starttag(name, attrs, self.state, self.contentstate)

    def handle_endtag(self, name):
        """Pop the innermost open element and notify its handler, if any."""
        if len(self.open_elements) == 0:
            # an extra end tag with nothing open: nothing to pop, so ignore it
            return

        expected_name, handler = self.open_elements.pop()
        assert name == expected_name, "Unmatched tags: expected %s, got %s" % (
            expected_name, name)

        if handler:
            handler.handle_endtag(name, self.state, self.contentstate)

    def handle_data(self, content):
        """Append a text node to the current block, managing whitespace."""
        state = self.state

        # normalise whitespace sequences to a single space
        text = re.sub(WHITESPACE_RE, ' ', content)
        only_whitespace = text == ' '

        if state.current_block is None:
            if only_whitespace:
                # ignore top-level whitespace
                return
            # create a new paragraph block for this content
            add_paragraph_block(state, self.contentstate)

        if only_whitespace:
            # With leading_whitespace = strip this node is not significant and
            # is skipped. Otherwise the space is not output yet; instead
            # leading_whitespace is set to force so that a space precedes the
            # next text node or inline element. If the block ends first, the
            # whitespace is simply dropped.
            if state.leading_whitespace != STRIP_WHITESPACE:
                state.leading_whitespace = FORCE_WHITESPACE
            return

        # strip or add leading whitespace according to the leading_whitespace flag
        whitespace_mode = state.leading_whitespace
        if whitespace_mode == STRIP_WHITESPACE:
            text = text.lstrip()
        elif whitespace_mode == FORCE_WHITESPACE and not text.startswith(' '):
            text = ' ' + text

        if text.endswith(' '):
            # hold back trailing whitespace in case the block ends here; force
            # a space before whatever text or inline element comes next
            text = text.rstrip()
            state.leading_whitespace = FORCE_WHITESPACE
        else:
            # no trailing whitespace: leading whitespace on the next text node
            # should be respected
            state.leading_whitespace = KEEP_WHITESPACE

        state.current_block.text += text

    def close(self):
        """Finish parsing, adding a trailing spacer paragraph when required."""
        # if content ends in an atomic block (or is empty), need to append a spacer paragraph
        needs_spacer = not self.state.has_preceding_nonatomic_block
        if needs_spacer:
            add_paragraph_block(self.state, self.contentstate)
        super().close()
class HtmlToContentStateHandler(HTMLParser):
    """Parse an HTML fragment and build up a ``ContentState`` from it.

    Tags are routed to element handlers looked up in ``self.element_handlers``
    (an ``HTMLRuleset``); text is whitespace-normalised and appended to the
    current block held on ``self.state``.
    """

    def __init__(self, features=()):
        """Create the handler ruleset and initialise the underlying parser.

        ``features`` is an iterable of feature names. For each one, the
        ``'contentstate'`` converter rule is fetched from ``feature_registry``
        and, when it exists, its ``'from_database_format'`` rules are added
        on top of the built-in ``p`` and ``br`` handlers.
        """
        self.paragraph_handler = BlockElementHandler('unstyled')
        self.element_handlers = HTMLRuleset({
            'p': self.paragraph_handler,
            'br': LineBreakHandler(),
        })
        for feature in features:
            rule = feature_registry.get_converter_rule('contentstate', feature)
            if rule is not None:
                self.element_handlers.add_rules(rule['from_database_format'])

        super().__init__(convert_charrefs=True)

    def reset(self):
        """Discard all parsing progress and start with empty state."""
        self.state = HandlerState()
        self.contentstate = ContentState()

        # stack of (name, handler) tuples for the elements we're currently inside
        self.open_elements = []

        super().reset()

    def handle_starttag(self, name, attrs):
        """Record an opening tag and delegate to its element handler.

        ``attrs`` is the list of ``(name, value)`` tuples supplied by
        ``HTMLParser``. An element with no matching rule still goes on the
        stack (with a ``None`` handler) so its end tag can be paired up.
        """
        attr_dict = dict(attrs)  # convert attrs from list of (name, value) tuples to a dict
        element_handler = self.element_handlers.match(name, attr_dict)

        if element_handler is None and not self.open_elements:
            # treat unrecognised top-level elements as paragraphs
            element_handler = self.paragraph_handler

        self.open_elements.append((name, element_handler))

        if element_handler:
            # NOTE(review): the raw attrs list (not attr_dict) is passed on here;
            # confirm that this is the form the element handlers expect.
            element_handler.handle_starttag(name, attrs, self.state, self.contentstate)

    def handle_endtag(self, name):
        """Close the innermost open element and delegate to its handler.

        A closing tag seen while no element is open is ignored rather than
        raising ``IndexError`` from popping an empty stack. A closing tag that
        does not match the innermost open element trips the assertion.
        """
        if not self.open_elements:
            return  # stray end tag with nothing open: avoid popping an empty list

        expected_name, element_handler = self.open_elements.pop()
        assert name == expected_name, "Unmatched tags: expected %s, got %s" % (expected_name, name)
        if element_handler:
            element_handler.handle_endtag(name, self.state, self.contentstate)

    def handle_data(self, content):
        """Append text to the current block, collapsing and deferring whitespace.

        Runs of whitespace are reduced to one space. Leading whitespace is
        stripped, kept or forced according to ``self.state.leading_whitespace``,
        and trailing whitespace is held back until further content follows.
        """
        # normalise whitespace sequences to a single space
        content = re.sub(WHITESPACE_RE, ' ', content)

        if self.state.current_block is None:
            if content == ' ':
                # ignore top-level whitespace
                return
            else:
                # create a new paragraph block for this content
                add_paragraph_block(self.state, self.contentstate)

        if content == ' ':
            # if leading_whitespace = strip, this whitespace node is not significant
            # and should be skipped.
            # For other cases, _don't_ output the whitespace yet, but set leading_whitespace = force
            # so that a space is forced before the next text node or inline element. If no such node
            # appears (= we reach the end of the block), the whitespace can rightfully be dropped.
            if self.state.leading_whitespace != STRIP_WHITESPACE:
                self.state.leading_whitespace = FORCE_WHITESPACE
        else:
            # strip or add leading whitespace according to the leading_whitespace flag
            if self.state.leading_whitespace == STRIP_WHITESPACE:
                content = content.lstrip()
            elif self.state.leading_whitespace == FORCE_WHITESPACE and not content.startswith(' '):
                content = ' ' + content

            if content.endswith(' '):
                # don't output trailing whitespace yet, because we want to discard it if the end
                # of the block follows. Instead, we'll set leading_whitespace = force so that
                # any following text or inline element will be prefixed by a space
                content = content.rstrip()
                self.state.leading_whitespace = FORCE_WHITESPACE
            else:
                # no trailing whitespace here - any leading whitespace at the start of the
                # next text node should be respected
                self.state.leading_whitespace = KEEP_WHITESPACE

            self.state.current_block.text += content

    def close(self):
        """Finish parsing, appending a spacer paragraph when one is needed."""
        # if content ends in an atomic block (or is empty), need to append a spacer paragraph
        if not self.state.has_preceding_nonatomic_block:
            add_paragraph_block(self.state, self.contentstate)
        super().close()
def test_html_ruleset(self):
    """Verify which rule ``HTMLRuleset.match`` selects for tag/attribute pairs.

    The ruleset holds one bare-tag rule, one attribute-presence rule and
    three attribute-value rules (unquoted, double-quoted, single-quoted).
    Lookups that should find nothing are asserted with ``assertIsNone``,
    which requires the ``None`` singleton itself rather than a value that
    merely compares equal to it.
    """
    ruleset = HTMLRuleset({
        'p': 'paragraph',
        'a[href]': 'link',
        'a[linktype=page]': 'page-link',
        'a[linktype="silly page"]': 'silly-page-link',
        "a[linktype='sensible page']": 'sensible-page-link',
    })

    # no rule is registered for 'div'
    self.assertIsNone(ruleset.match('div', {}))

    # the 'p' rule does not depend on attributes
    self.assertEqual(ruleset.match('p', {}), 'paragraph')
    self.assertEqual(ruleset.match('p', {'class': 'intro'}), 'paragraph')

    # an 'a' with neither href nor a known linktype matches nothing
    self.assertIsNone(ruleset.match('a', {'class': 'button'}))
    self.assertEqual(
        ruleset.match('a', {
            'class': 'button',
            'href': 'http://wagtail.io'
        }), 'link')
    self.assertIsNone(
        ruleset.match('a', {
            'class': 'button',
            'linktype': 'document'
        }))
    self.assertEqual(
        ruleset.match('a', {
            'class': 'button',
            'linktype': 'page'
        }), 'page-link')
    self.assertEqual(
        ruleset.match('a', {
            'class': 'button',
            'linktype': 'silly page'
        }), 'silly-page-link')
    self.assertEqual(
        ruleset.match('a', {
            'class': 'button',
            'linktype': 'sensible page'
        }), 'sensible-page-link')
def test_html_ruleset(self):
    """Check ``HTMLRuleset.match`` results for tag-only and attribute rules.

    The same sequence of lookups is made against one ruleset: an unknown
    tag, a bare ``p`` rule, an ``a[href]`` presence rule and three
    ``a[linktype=...]`` value rules using different quoting styles.
    """
    rules = {
        "p": "paragraph",
        "a[href]": "link",
        "a[linktype=page]": "page-link",
        'a[linktype="silly page"]': "silly-page-link",
        "a[linktype='sensible page']": "sensible-page-link",
    }
    ruleset = HTMLRuleset(rules)

    def button_link(**extra):
        # attributes of an <a class="button"> with any extra attributes appended
        return {"class": "button", **extra}

    self.assertIsNone(ruleset.match("div", {}))

    self.assertEqual(ruleset.match("p", {}), "paragraph")
    self.assertEqual(ruleset.match("p", {"class": "intro"}), "paragraph")

    self.assertIsNone(ruleset.match("a", button_link()))

    href_match = ruleset.match("a", button_link(href="http://wagtail.org"))
    self.assertEqual(href_match, "link")

    document_match = ruleset.match("a", button_link(linktype="document"))
    self.assertIsNone(document_match)

    page_match = ruleset.match("a", button_link(linktype="page"))
    self.assertEqual(page_match, "page-link")

    silly_match = ruleset.match("a", button_link(linktype="silly page"))
    self.assertEqual(silly_match, "silly-page-link")

    sensible_match = ruleset.match("a", button_link(linktype="sensible page"))
    self.assertEqual(sensible_match, "sensible-page-link")