def test_template_in_and_after_nowiki(self):
    # A template inside <nowiki> is expected to be left untouched, while the
    # identical template right after the closing tag is expected to be parsed.
    # NOTE(review): other tests in this file build their expectations for
    # markup with htmlspecialchars(); confirm that the raw '<nowiki>' in the
    # expected string below is intended and not an entity-decoding artifact.
    wikitext = 'Lorem <nowiki>{{ipsum}}</nowiki>{{ipsum}}'
    actual = preprocessToXml(wikitext)
    untouched_part = '<root>Lorem <nowiki>{{ipsum}}</nowiki>'
    parsed_part = '<template><title>ipsum</title></template></root>'
    self.assertEqual(actual, untouched_part + parsed_part)
def test_tag_unclosed(self):
    # An unclosed <div> is expected to come out escaped, and the template
    # following it is still expected to be parsed.
    template_name = 'ipsum'
    leading = 'Lorem <div>'
    wikitext = '%s{{%s}}' % (leading, template_name)
    actual = preprocessToXml(wikitext)
    expected_head = '<root>%s' % htmlspecialchars(leading)
    expected_tail = '<template><title>%s</title></template></root>' % template_name
    self.assertEqual(actual, expected_head + expected_tail)
def test_template_with_argument_unicode(self):
    # Non-ASCII characters in both the template name and its single
    # positional argument.
    title = 'ølipsum'
    value = 'ål€en'
    actual = preprocessToXml('{{%s|%s}}' % (title, value))
    expected_parts = [
        '<root><template><title>%s</title>' % title,
        '<part><name index="1" /><value>%s</value></part>' % value,
        '</template></root>',
    ]
    self.assertEqual(actual, ''.join(expected_parts))
def test_template_with_tplarg(self):
    # A fairly complicated example: the template name is itself a tplarg.
    # This also involves re-adding stack elements back onto the stack
    # after the }}}
    wikitext = 'LLorem ipsum {{{{{Domino}}} | est = infinitus }}'
    actual = preprocessToXml(wikitext)
    expected_parts = [
        '<root>LLorem ipsum <template><title>',
        '<tplarg><title>Domino</title></tplarg> </title>',
        '<part><name> est </name>=<value> infinitus </value></part>',
        '</template></root>',
    ]
    self.assertEqual(actual, ''.join(expected_parts))
def test_nested_templates(self):
    # Three templates, each passed as the first positional argument of the
    # one enclosing it.
    names = ('Unus', 'Duo', 'Infinitas')
    wikitext = '{{%s|{{%s|{{%s}}}}}}' % names
    actual = preprocessToXml(wikitext)

    def template_xml(title, inner):
        # Expected XML for a template with an optional first positional
        # argument; an empty `inner` means "no argument at all".
        if inner == '':
            part = ''
        else:
            part = '<part><name index="1" /><value>%s</value></part>' % inner
        return '<template><title>%s</title>' % title + part + '</template>'

    innermost = template_xml(names[2], '')
    middle = template_xml(names[1], innermost)
    outermost = template_xml(names[0], middle)
    self.assertEqual(actual, '<root>%s</root>' % outermost)
def get_body_text(text):
    """Return a rough plain-text version of the body of a wikitext page.

    The wikitext is run through preprocessToXml and parsed.  Only the root
    element's own text and the tail text following each of its direct
    children are kept, so the content of every child element (templates, for
    instance) is dropped.  After that, tables delimited by '{|' and '|}',
    heading and bold/italic markers, list lines, 'Kategori:' category links,
    external links and interwiki-style links are removed, wikilinks and
    file/image links are reduced to their display text, and whitespace is
    collapsed to single spaces.

    :param text: the wikitext to extract body text from
    :returns: the extracted text as a single string
    """
    xml = preprocessToXml(text)

    # Turn escaped angle brackets back into real ones so that markup is parsed
    # as elements below.  The previous code replaced '<' with '<' and '>' with
    # '>', which did nothing.
    # NOTE(review): assumes preprocessToXml emits '&lt;' / '&gt;' for literal
    # angle brackets, as the htmlspecialchars-based expectations in this
    # file's tests suggest -- confirm.
    xml = xml.replace('&lt;', '<').replace('&gt;', '>')
    root = fromstring(condition_for_lxml(xml))

    # Keep only text that sits directly under the root element.
    pieces = []
    if root.text:
        pieces.append(root.text)
    for child in root.iterchildren():
        if child.tail:
            pieces.append(child.tail)
    out = ''.join(pieces)

    # Strip tables: keep only the text found at nesting depth zero.
    buf = []
    depth = 0
    cpos = 0
    while True:
        openpos = out.find('{|', cpos)
        closepos = out.find('|}', cpos)
        if openpos == -1 and closepos == -1:
            break
        # The next marker is an opener if there is no closer left, or if the
        # opener comes first.
        next_is_open = closepos == -1 or (openpos != -1 and openpos < closepos)
        if next_is_open:
            if depth == 0:
                buf.append(out[cpos:openpos])
            cpos = openpos + 2
            depth += 1
        else:
            cpos = closepos + 2
            depth -= 1
    # The trailing text is only kept when all tables were balanced.
    if depth == 0:
        buf.append(out[cpos:])
    out = ''.join(buf)

    out = re.sub(r'==[=]*', '', out)  # drop header markers (but keep header text)
    out = re.sub(r"''[']*", '', out)  # drop bold/italic markers (but keep text)

    # Note that re.sub has no flags support in python2.6, which is why we use re.compile
    list_line = re.compile(r'^(?:#|\*).*?$', flags=re.MULTILINE)
    out = list_line.sub('', out)  # drop lists altogether

    out = re.sub(r'\[\[Kategori:[^\]]+\]\]', '', out)  # drop categories
    out = re.sub(r'(?<!\[)\[(?!\[)[^ ]+ [^\]]+\]', '', out)  # drop external links
    # wikilinks as text, '[[Artikkel 1|artikkelen]]' -> 'artikkelen'
    out = re.sub(r'\[\[(?:[^:|\]]+\|)?([^:\]]+)\]\]', '\\1', out)
    # image descriptions only
    out = re.sub(r'\[\[(?:Fil|File|Image|Bilde):[^\]]+\|([^\]]+)\]\]', '\\1', out)
    out = re.sub(r'\[\[[A-Za-z\-]+:[^\]]+\]\]', '', out)  # drop interwikis

    # Drop whitespace-separated tokens that are a single punctuation
    # character, and normalise all whitespace to single spaces.
    exclude = set(string.punctuation)
    out = ' '.join(token for token in out.split() if token not in exclude)
    return out
def test_link(self):
    # A plain wikilink is expected to pass through unchanged
    wikitext = 'Lorem [[ipsum]]'
    actual = preprocessToXml(wikitext)
    self.assertEqual(actual, '<root>%s</root>' % wikitext)
def test_simple_template_unicode(self):
    # Argument-less template whose name contains non-ASCII characters
    title = 'Lårem øpsum'
    actual = preprocessToXml('{{%s}}' % title)
    opening = '<root><template><title>%s</title>' % title
    closing = '</template></root>'
    self.assertEqual(actual, opening + closing)
def test_linebreak(self):
    # Make sure the preprocessor does not eat linebreaks
    newline = '\n'
    actual = preprocessToXml(newline)
    expected = '<root>%s</root>' % newline
    self.assertEqual(actual, expected)
def test_simple(self):
    # Plain text without any markup is only wrapped in <root>
    plain = 'Lorem ipsum'
    actual = preprocessToXml(plain)
    self.assertEqual(actual, '<root>%s</root>' % plain)
def test_nonmatching_braces3(self):
    # Five opening braces but only two closing ones: only the innermost
    # pair is expected to form a template, the rest stays literal text.
    wikitext = '{{Lorem{{{ipsum}}dolor'
    actual = preprocessToXml(wikitext)
    expected = ''.join([
        '<root>{{Lorem{<template><title>ipsum</title>',
        '</template>dolor</root>',
    ])
    self.assertEqual(actual, expected)
def test_html_tags2(self):
    # Text containing a closed inline HTML tag
    # NOTE(review): other tests in this file build their expectations for
    # markup with htmlspecialchars(); confirm that the raw '<b>' in the
    # expected string below is intended and not an entity-decoding artifact.
    wikitext = 'Lorem<b>ipsum</b> ipsam'
    actual = preprocessToXml(wikitext)
    expected = '<root>Lorem<b>ipsum</b> ipsam</root>'
    self.assertEqual(actual, expected)
def test_template_in_math(self):
    # The template should not be handled if inside <math>
    wikitext = 'Lorem <math>{{ipsum}}</math>'
    actual = preprocessToXml(wikitext)
    escaped = htmlspecialchars(wikitext)
    self.assertEqual(actual, '<root>%s</root>' % escaped)
def test_template_in_comment(self):
    # The template should not be handled if inside a comment
    wikitext = 'Lorem <!--{{ipsum}}-->'
    actual = preprocessToXml(wikitext)
    escaped = htmlspecialchars(wikitext)
    self.assertEqual(actual, '<root>%s</root>' % escaped)
def test_unclosed_template2(self):
    # Leaving out both end braces: the input is expected to stay literal text
    wikitext = '{{Lorem ipsum'
    actual = preprocessToXml(wikitext)
    self.assertEqual(actual, '<root>%s</root>' % wikitext)
def test_comment_unclosed(self):
    # A comment that is opened but never closed is expected to come out as
    # escaped text
    wikitext = 'Lorem <!-- ipsum '
    actual = preprocessToXml(wikitext)
    escaped = htmlspecialchars(wikitext)
    self.assertEqual(actual, '<root>%s</root>' % escaped)