def make_parsers():
    """Build and return the MediaWiki (preprocessor, html-parser) pair.

    The preprocessor expands templates (none registered here); the HTML
    parser is configured with empty allow-lists and prefixes internal
    links with WIKI_BASE_URI.
    """
    templates = {}
    tags, self_closing, attributes = [], [], []
    interwiki, namespaces = {}, {}

    # Alias the two same-named factories so neither import shadows the other.
    from mediawiki_parser.preprocessor import make_parser as make_preprocessor
    from mediawiki_parser.html import make_parser as make_html_parser

    preprocessor = make_preprocessor(templates)
    parser = make_html_parser(tags, self_closing, attributes,
                              interwiki, namespaces,
                              internal_link_prefix=WIKI_BASE_URI)
    return preprocessor, parser
def _grammar(self, templates):
    """Build and return a preprocessor grammar.

    templates -- mapping handed straight through to
    preprocessor.make_parser; its parsed grammar is returned.
    """
    return preprocessor.make_parser(templates)
def mediawiki(pad):
    """Render a pad's MediaWiki source to an HTML fragment.

    Decodes ``pad.content`` as UTF-8, runs it through the template
    preprocessor and the HTML parser (all allow-lists empty), then strips
    the wrapping <body> element from the rendered result.
    """
    empty_templates = {}
    no_tags = []
    no_self_closing = []
    no_attributes = []
    no_interwiki = {}
    no_namespaces = {}

    pre = make_parser(empty_templates)
    html_parser = make_html_parser(no_tags, no_self_closing, no_attributes,
                                   no_interwiki, no_namespaces)

    expanded = pre.parse(pad.content.decode("Utf-8"))
    rendered = html_parser.parse(expanded.leaves()).value
    return rendered.replace("<body>", "").replace("</body>", "")
def testit(content):
    """Run *content* through the MediaWiki parse pipeline.

    Stores the parsed leaves in the module-level ``foo`` so they can be
    inspected interactively after the call.
    """
    global foo

    templates = {}
    tags = ["PRE"]
    self_closing = []
    attributes = []
    interwiki = {}
    namespaces = {}

    pre = preprocessor.make_parser(templates)
    parser = html.make_parser(tags, self_closing, attributes,
                              interwiki, namespaces)

    expanded = pre.parseTest(content)
    foo = parser.parseTest(expanded).leaves()
def _preprocessor(self, templates):
    """Return a preprocessor parser for *templates*.

    Thin delegation to preprocessor.make_parser.
    """
    return preprocessor.make_parser(templates)
# Parser configuration: which tags may self-close and which attributes pass
# the HTML allow-list.
allowed_autoclose_tags = ['br', 'hr']
allowed_parameters = ['class', 'style', 'name', 'id', 'scope']

# Interwiki link prefixes, keyed by language code.
interwiki = {
    'en': 'http://en.wikipedia.org/wiki/',
    'fr': 'http://fr.wikipedia.org/wiki/'
}

# MediaWiki namespace name -> numeric namespace id (French and English aliases).
namespaces = {
    'Template': 10,
    u'Catégorie': 14,
    'Category': 14,
    'File': 6,
    'Image': 6
}

# NOTE(review): `allowed_tags` is defined outside this view — confirm it is in
# scope before this line runs.
parser = html.make_parser(allowed_tags, allowed_autoclose_tags,
                          allowed_parameters, interwiki, namespaces)
preprocessor_parser = preprocessor.make_parser({})

# Static header injected into rendered pages.
siteSubElem = lxml.html.fromstring(
    '<div class="siteSub">From Fakipedia, the fake Wikipedia</div><div class="contentSub"/>'
)


def preprocess(source):
    # Normalize whitespace around newlines, '=' (heading markers) and '@'
    # before splitting the document into lines.
    source = source.replace("\n ", "\n") \
                   .replace(" \n", "\n") \
                   .replace("= ", "=") \
                   .replace(" =", "=") \
                   .replace("@ ", " ") \
                   .replace(" @", " ") \
                   .strip()
    source_split = source.split("\n")
    # fixing title
    # NOTE(review): snippet appears truncated here — the remainder of
    # preprocess() is outside this view.
if line == "": break while line[0] == " ": line = line[1:] if line == "</page>\n": temp_page += line n += 1 pages.append(temp_page) # w = Wikipedia(temp_page) elif line == "<page>\n": temp_page = line else: temp_page += line return pages # 3176788 pages if __name__ == "__main__": fn = "zhwiki" data = chunky_read(fn) dics = [] for item in data: json = dumps(bf.data(fromstring(item))) dic = loads(json) dics.append(dic) d = dics[2] text = d["page"]["revision"]["text"]["$"] templates = {} preprocessor = make_parser(templates) output = preprocessor.parse(text)
def parse_data_to_markup(source, dest, format_='yaml', template='standard_entry.md.jinja'):
    """Parse a source data file and render it through a Jinja template.

    Reads the file at *source*, converts it into a Python dictionary
    according to *format_*, and writes the rendered template to *dest*.
    The 'wiki' format bypasses templating entirely and writes the parsed
    HTML directly.

    Args:
        source (str): Path to the data file to read and parse.
        dest (file): File-like object to write the rendered output to.

    Kwargs:
        format_ (string): Format of the source file: 'yaml' (default),
            'hjson', 'cfg', 'plist' or 'wiki'.
        template (string): Name of the Jinja template to read and render.

    Raises:
        RuntimeError: If *format_* is not one of the supported formats.
    """
    data = None
    if format_ == 'yaml':
        # NOTE(review): yaml.load without an explicit Loader can execute
        # arbitrary constructors; prefer yaml.safe_load if the data allows it.
        with open(source, 'r') as f:
            data = yaml.load(f)
    elif format_ == 'hjson':
        import hjson
        with open(source, 'r') as f:
            data = hjson.load(f)
    elif format_ == 'cfg':
        # Config parser needs the most... massaging.
        config = ConfigParser.RawConfigParser()
        config.read(source)
        data = {key: value.replace('\\n', '\n')
                for key, value in config.items('trip')}
        # Materialize as a list (not a one-shot map iterator) so the
        # template can iterate the guest list more than once.
        data['guest_list'] = [guest for _, guest in config.items('guests')]
    elif format_ == 'plist':
        import plistlib
        data = plistlib.readPlist(source)
    elif format_ == 'wiki':
        from mediawiki_parser.html import make_parser as make_parser_html
        from mediawiki_parser.preprocessor import make_parser
        preprocessor = make_parser({})
        parser = make_parser_html([], [], [], {}, {})
        with open(source, 'r') as f:
            preprocessed_text = preprocessor.parse(f.read())
        output = parser.parse(preprocessed_text.leaves())
        # Wiki output is already markup: write it out and skip templating.
        dest.write(output.value)
        return
    else:
        raise RuntimeError("No usable format given to data parser!")

    loader = jinja2.FileSystemLoader('tools/templates')
    env = jinja2.Environment(loader=loader)
    template = env.get_template(template)
    data['source'] = source
    dest.write(template.render(**data))