def parseString(title=None, raw=None, wikidb=None, revision=None,
                lang=None, magicwords=None, expandTemplates=True):
    """parse article with title from raw mediawiki text"""
    uniquifier = None
    siteinfo = None
    assert title is not None, 'no title given'

    if raw is None:
        page = wikidb.normalize_and_get_page(title, 0)
        if page:
            raw = page.rawtext
        else:
            raw = None
    assert raw is not None, "cannot get article %r" % (title,)

    input_raw = raw
    te = None
    if wikidb:
        if expandTemplates:
            te = expander.Expander(raw, pagename=title, wikidb=wikidb)
            input_raw = te.expandTemplates(True)
            uniquifier = te.uniquifier
        if hasattr(wikidb, 'get_siteinfo'):
            siteinfo = wikidb.get_siteinfo()

        src = None
        if hasattr(wikidb, 'getSource'):
            src = wikidb.getSource(title, revision=revision)
            assert not isinstance(src, dict)
        if not src:
            src = metabook.source()

        if lang is None:
            lang = src.language
        if magicwords is None:
            if siteinfo is not None and 'magicwords' in siteinfo:
                magicwords = siteinfo['magicwords']
            else:
                magicwords = src.get('magicwords')

    if siteinfo is None:
        nshandler = nshandling.get_nshandler_for_lang(lang)
    else:
        nshandler = nshandling.nshandler(siteinfo)

    a = compat.parse_txt(input_raw, title=title, wikidb=wikidb,
                         nshandler=nshandler, lang=lang, magicwords=magicwords,
                         uniquifier=uniquifier, expander=te)

    a.caption = title
    if te and te.magic_displaytitle:
        a.caption = te.magic_displaytitle

    from mwlib.old_uparser import postprocessors
    for x in postprocessors:
        x(a, title=title, revision=revision, wikidb=wikidb, lang=lang)

    return a
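A minimal usage sketch for parseString above, under stated assumptions: the wikitext is handed in directly as raw, lang is set explicitly so the getSource/siteinfo fallbacks are never reached, and template expansion is switched off because no wikidb is supplied. Whether the old_uparser postprocessors tolerate wikidb=None is an assumption, not something the snippet confirms.

# Hedged sketch, not taken from the source: parse raw wikitext without a wikidb.
article = parseString(title="Example",
                      raw="Hello ''world'', see [[Data structure]].",
                      lang="en",
                      expandTemplates=False)
print(article.caption)  # stays the title here, since no expander set magic_displaytitle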
def parse_txt(txt, xopts=None, **kwargs):
    if xopts is None:
        xopts = XBunch(**kwargs)
    else:
        xopts.__dict__.update(**kwargs)

    if xopts.expander is None:
        from mwlib.expander import Expander, DictDB
        xopts.expander = Expander("", "pagename", wikidb=DictDB())

    if xopts.nshandler is None:
        xopts.nshandler = nshandling.get_nshandler_for_lang(xopts.lang or 'en')

    xopts.imagemod = util.ImageMod(xopts.magicwords)

    uniquifier = xopts.uniquifier
    if uniquifier is None:
        uniquifier = uniq.Uniquifier()
        txt = uniquifier.replace_tags(txt)
        xopts.uniquifier = uniquifier

    tokens = tokenize(txt, uniquifier=uniquifier)

    td2 = tagparser()
    a = td2.add
    a("code", 10)
    a("span", 20)
    a("li", 25, blocknode=True, nested=False)
    a("dl", 28, blocknode=True)
    a("dt", 26, blocknode=True, nested=False)
    a("dd", 26, blocknode=True, nested=True)

    td1 = tagparser()
    a = td1.add
    a("blockquote", 5)
    a("references", 15)
    a("p", 30, blocknode=True, nested=False)
    a("ul", 35, blocknode=True)
    a("ol", 40, blocknode=True)
    a("center", 45, blocknode=True)

    td_parse_h = tagparser()
    for i in range(1, 7):
        td_parse_h.add("h%s" % i, i)

    parsers = [fixlitags,
               mark_style_tags,
               parse_singlequote,
               parse_preformatted,
               td2,
               parse_paragraphs,
               td1,
               parse_lines,
               parse_div,
               parse_links,
               parse_urls,
               parse_inputbox,
               td_parse_h,
               parse_sections,
               remove_table_garbage,
               fix_tables,
               parse_tables,
               parse_uniq,
               fix_named_url_double_brackets,
               fix_break_between_pre]

    combined_parser(parsers)(tokens, xopts)
    return tokens
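A usage sketch for parse_txt above, relying only on the defaults visible in the snippet: with no xopts, the function builds its own DictDB-backed expander, falls back to the 'en' namespace handler, and creates a fresh uniquifier before tokenizing.

# Hedged sketch: relies only on the default-construction behaviour shown above.
tokens = parse_txt(u"== Heading ==\nSome ''italic'' text and a [[Link]].")
for t in tokens:
    print(t)  # top-level tokens produced by the combined parser pipeline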
class nuwiki(object):
    def __init__(self, path, allow_pickle=False):
        self.path = os.path.abspath(path)
        d = os.path.join(self.path, "images", "safe")
        if not os.path.exists(d):
            try:
                os.makedirs(d)
            except OSError as exc:
                if exc.errno != 17:  # file exists
                    raise

        self.excluded = set(x.get("title")
                            for x in self._loadjson("excluded.json", []))

        self.revisions = {}
        self._read_revisions()

        fn = os.path.join(self.path, 'authors.db')
        if not os.path.exists(fn):
            self.authors = None
            log.warn('no authors present. parsing revision info instead')
        else:
            self.authors = DumbJsonDB(fn, allow_pickle=allow_pickle)

        fn = os.path.join(self.path, 'html.db')
        if not os.path.exists(fn):
            self.html = self.extractHTML(self._loadjson("parsed_html.json", {}))
            log.warn('no html present. parsing revision info instead')
        else:
            self.html = DumbJsonDB(fn, allow_pickle=allow_pickle)

        fn = os.path.join(self.path, 'imageinfo.db')
        if not os.path.exists(fn):
            self.imageinfo = self._loadjson("imageinfo.json", {})
            log.warn('loading imageinfo from pickle')
        else:
            self.imageinfo = DumbJsonDB(fn, allow_pickle=allow_pickle)

        self.redirects = self._loadjson("redirects.json", {})
        self.siteinfo = self._loadjson("siteinfo.json", {})
        self.nshandler = nshandling.nshandler(self.siteinfo)
        self.en_nshandler = nshandling.get_nshandler_for_lang('en')
        self.nfo = self._loadjson("nfo.json", {})
        self.set_make_print_template()
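A sketch of how nuwiki might be instantiated, assuming path points at an unpacked dump directory containing the files the constructor reads (siteinfo.json, redirects.json, nfo.json, and optionally authors.db / html.db / imageinfo.db); the path itself is hypothetical.

# Hedged sketch: the dump location is an assumption, not taken from the source.
wiki = nuwiki("/tmp/collection-dump", allow_pickle=False)
print(len(wiki.redirects))  # redirects.json loaded in __init__
print(wiki.nshandler)       # nshandler built from siteinfo.json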
def empty():
    empty = core.XBunch()
    empty.nshandler = nshandling.get_nshandler_for_lang('de')
    return empty
def _get_nshandler(self):
    if self._nshandler is not None:
        return self._nshandler
    return nshandling.get_nshandler_for_lang('en')  # FIXME
def test_localized_redirect_matcher():
    m = nshandling.get_nshandler_for_lang("de").redirect_matcher
    assert m("#REDIRECT [[Data structure]]") == "Data structure", "bad redirect"
    assert m("#WEITERLEITUNG [[Data structure]]") == "Data structure", "bad redirect"
def test_redirect_matcher():
    m = nshandling.get_nshandler_for_lang("en").redirect_matcher
    assert m("#REDIRECT [[Data structure#Active data structures]]") == "Data structure", "bad redirect"