def getImageTemplates(self, name, wikidb=None):
    """Return the templates found on the description page of image ``name``.

    The description page is looked up with
    ``self.get_image_description_page(name)`` and its ``rawtext`` is handed
    to ``mwlib.expander.get_templates``.  If no page is found, a message is
    printed and an empty list is returned.

    ``wikidb`` is accepted but not used by this method.
    """
    from mwlib.expander import get_templates

    description_page = self.get_image_description_page(name)
    if description_page is None:
        # Single parenthesised argument: prints the same text as the
        # statement form under Python 2.
        print('no such image: %r' % name)
        return []
    return get_templates(description_page.rawtext)
def getImageTemplates(self, name, wikidb=None):
    """Look up the image description page for ``name`` and list its templates.

    Returns ``get_templates(page.rawtext)`` when
    ``self.get_image_description_page(name)`` yields a page; otherwise
    prints a "no such image" message and returns ``[]``.

    ``wikidb`` is part of the signature but is not read here.
    """
    from mwlib.expander import get_templates

    description_page = self.get_image_description_page(name)
    if description_page is None:
        # One parenthesised argument keeps the output identical to the
        # Python 2 print statement.
        print("no such image: %r" % name)
        return []
    return get_templates(description_page.rawtext)
def process_templates(self, actual_title, s):
    """Write the templates of one page to the templates output file.

    A header line made of ``WikipediaParser.page_separator`` and
    ``actual_title`` is written (UTF-8 encoded) to ``self.templates_f``,
    then ``WikitextToConll.write_templates_into_f`` is asked to write the
    templates found in ``s`` to the same file.  When that call returns a
    false value, a warning naming the page goes to ``sys.stderr``.

    Returns the result of ``mwlib.expander.get_templates(s)``.
    """
    from mwlib.expander import get_templates

    header = u"{0} {1}\n".format(WikipediaParser.page_separator, actual_title)
    self.templates_f.write(header.encode("utf-8"))

    written_ok = WikitextToConll.write_templates_into_f(s, self.templates_f)
    if not written_ok:
        warning = u"Problems with templates and mwparser: {0}\n".format(actual_title)
        sys.stderr.write(warning.encode("utf-8"))

    return get_templates(s)
def process_templates(self, actual_title, s):
    """Emit the templates of the page ``actual_title`` into ``self.templates_f``.

    Steps, in order:

    1. write ``"<page_separator> <actual_title>\\n"`` (UTF-8) to the file;
    2. call ``WikitextToConll.write_templates_into_f(s, self.templates_f)``;
    3. if that call reports failure (falsy result), write a UTF-8 encoded
       diagnostic to ``sys.stderr``.

    The return value is ``mwlib.expander.get_templates(s)``.
    """
    from mwlib.expander import get_templates

    separator_line = u"{0} {1}\n".format(
        WikipediaParser.page_separator, actual_title)
    self.templates_f.write(separator_line.encode("utf-8"))

    if not WikitextToConll.write_templates_into_f(s, self.templates_f):
        problem = u"Problems with templates and mwparser: {0}\n".format(
            actual_title)
        sys.stderr.write(problem.encode("utf-8"))

    return get_templates(s)
def getContributorsFromInformationTemplate(raw, title, wikidb):
    """Extract contributor names from the templates in the wikitext ``raw``.

    Lookup order:

    1. the ``Information`` template, if ``find_template`` locates one and
       it yields any authors;
    2. otherwise the authors collected from every template name reported
       by ``get_templates(raw)``;
    3. otherwise the targets of all user-namespace links in ``raw``.

    ``title`` and ``wikidb`` are passed through to the mwlib parser and
    expander.  Returns a list.
    """
    from mwlib.expander import find_template, get_templates, get_template_args, Expander
    from mwlib import uparser, parser, advtree

    expander = Expander(u'', title, wikidb)

    def getUserLinks(text):
        """Return the sorted, de-duplicated targets of user links in ``text``."""
        def isUserLink(node):
            return isinstance(node, parser.NamespaceLink) and node.namespace == 2  # NS_USER

        tree = uparser.parseString(title, raw=text, wikidb=wikidb)
        return sorted(set([link.target for link in tree.filter(isUserLink)]))

    def get_authors_from_template_args(template):
        """Return the authors named by one template node, or ``[]``."""
        args = get_template_args(template, expander)

        author_arg = args.get('Author', None)
        if author_arg:
            # Prefer explicit user links inside the Author argument ...
            userlinks = getUserLinks(author_arg)
            if userlinks:
                return userlinks
            # ... and fall back to the argument's display text.
            node = uparser.parseString('', raw=args['Author'], wikidb=wikidb)
            advtree.extendClasses(node)
            display_text = node.getAllDisplayText().strip()
            if display_text:
                return [display_text]

        if not args.args:
            return []
        # No usable Author argument: scan the positional arguments for user links.
        positional = '\n'.join(args.get(idx, u'') for idx in range(len(args.args)))
        return getUserLinks(positional)

    info_template = find_template(raw, 'Information')
    if info_template is not None:
        authors = get_authors_from_template_args(info_template)
        if authors:
            return authors

    authors = []
    for template_name in get_templates(raw):
        found = find_template(raw, template_name)
        if found is not None:
            authors.extend(get_authors_from_template_args(found))

    return authors or getUserLinks(raw)
def getContributorsFromInformationTemplate(raw, title, wikidb):
    """Extract contributor names from the templates in the wikitext ``raw``.

    ``raw`` is parsed once up front and a copy of that parsed list is
    handed to every ``find_template`` call.  Lookup order:

    1. the ``Information`` template, if found and it yields any authors;
    2. otherwise the authors collected from every template name reported
       by ``get_templates(raw)``;
    3. otherwise the targets of all user-namespace links in ``raw``.

    ``title`` and ``wikidb`` are passed through to the mwlib parser and
    expander.  Returns a list.
    """
    from mwlib.expander import find_template, get_templates, get_template_args, Expander
    from mwlib import uparser, parser, advtree
    from mwlib.templ.parser import parse

    expander = Expander(u'', title, wikidb)

    def getUserLinks(text):
        """Return the sorted, de-duplicated targets of user links in ``text``."""
        def isUserLink(node):
            return isinstance(node, parser.NamespaceLink) and node.namespace == 2  # NS_USER

        tree = uparser.parseString(title, raw=text, wikidb=wikidb)
        return sorted(set([link.target for link in tree.filter(isUserLink)]))

    def get_authors_from_template_args(template):
        """Return the authors named by one template node, or ``[]``."""
        args = get_template_args(template, expander)

        author_arg = args.get('Author', None)
        if author_arg:
            # NOTE: the original carries a commented-out user-link lookup on
            # the Author argument here; only its display text is used.
            node = uparser.parseString('', raw=args['Author'], wikidb=wikidb)
            advtree.extendClasses(node)
            display_text = node.getAllDisplayText().strip()
            if display_text:
                return [display_text]

        if not args.args:
            return []
        # No usable Author argument: scan the positional arguments for user links.
        positional = '\n'.join(args.get(idx, u'') for idx in range(len(args.args)))
        return getUserLinks(positional)

    # Parse once; each find_template call receives its own shallow copy
    # (presumably because find_template consumes the list -- confirm).
    parsed_raw = [parse(raw, replace_tags=expander.replace_tags)]

    info_template = find_template(None, 'Information', parsed_raw[:])
    if info_template is not None:
        authors = get_authors_from_template_args(info_template)
        if authors:
            return authors

    authors = []
    for template_name in get_templates(raw):
        found = find_template(None, template_name, parsed_raw[:])
        if found is not None:
            authors.extend(get_authors_from_template_args(found))

    return authors or getUserLinks(raw)
def getContributorsFromInformationTemplate(raw, title, wikidb):
    """Return the contributors named by the templates of the wikitext ``raw``.

    The wikitext is parsed a single time; ``find_template`` is always given
    a copy of that parsed list.  Sources are tried in this order, and the
    first non-empty result wins:

    * the ``Information`` template;
    * all templates listed by ``get_templates(raw)``, results concatenated;
    * every user-namespace link in ``raw``.

    ``title`` and ``wikidb`` are forwarded to the mwlib parser/expander.
    """
    from mwlib.expander import find_template, get_templates, get_template_args, Expander
    from mwlib import uparser, parser, advtree
    from mwlib.templ.parser import parse

    expander = Expander(u"", title, wikidb)

    def getUserLinks(text):
        """Collect user-link targets from ``text`` as a sorted list without duplicates."""
        def isUserLink(node):
            return isinstance(node, parser.NamespaceLink) and node.namespace == 2  # NS_USER

        parsed = uparser.parseString(title, raw=text, wikidb=wikidb)
        targets = set([link.target for link in parsed.filter(isUserLink)])
        return sorted(targets)

    def get_authors_from_template_args(template):
        """Derive the author list from the arguments of one template node."""
        args = get_template_args(template, expander)

        author_arg = args.get("Author", None)
        if author_arg:
            # NOTE: a user-link lookup on the Author argument is commented
            # out in the original; the display text alone is used.
            node = uparser.parseString("", raw=args["Author"], wikidb=wikidb)
            advtree.extendClasses(node)
            display_text = node.getAllDisplayText().strip()
            if display_text:
                return [display_text]

        if not args.args:
            return []
        # Fall back to user links found in the positional arguments.
        joined = "\n".join(args.get(pos, u"") for pos in range(len(args.args)))
        return getUserLinks(joined)

    # One parse for all lookups; a fresh shallow copy goes to each call
    # (presumably find_template consumes its list argument -- confirm).
    parsed_raw = [parse(raw, replace_tags=expander.replace_tags)]

    information = find_template(None, "Information", parsed_raw[:])
    if information is not None:
        authors = get_authors_from_template_args(information)
        if authors:
            return authors

    authors = []
    for template_name in get_templates(raw):
        node = find_template(None, template_name, parsed_raw[:])
        if node is not None:
            authors.extend(get_authors_from_template_args(node))

    return authors or getUserLinks(raw)
def getImageTemplatesAndArgs(self, name, wikidb=None):
    """Return the templates of an image description page plus selected arguments.

    The description page for ``name`` is fetched with
    ``self.get_image_description_page``.  Its template names come from
    ``get_templates(page.rawtext)``; the collection is then extended with
    every template argument that is a string, longer than three characters
    and free of spaces.

    Returns the updated template collection, or ``[]`` when there is no
    description page.  The collection is whatever ``get_templates``
    returns; ``update`` is called on it, so it is presumably a set (verify
    against mwlib).

    ``wikidb`` is accepted but not used by this method.
    """
    from mwlib.expander import get_templates

    page = self.get_image_description_page(name)
    if page is None:
        return []

    from mwlib.expander import find_template
    from mwlib.templ.evaluate import Expander
    from mwlib.templ.parser import parse
    from mwlib.templ.misc import DictDB

    templates = get_templates(page.rawtext)
    args = set()
    e = Expander('', wikidb=DictDB())
    # avoid parsing with every call to find_template
    parsed_raw = [parse(page.rawtext, replace_tags=e.replace_tags)]

    for template_name in templates:
        tmpl = find_template(None, template_name, parsed_raw[:])
        if tmpl is None:
            # find_template may come back empty-handed for a listed name;
            # skip it instead of failing on tmpl[1] with a TypeError.
            continue
        for arg in tmpl[1]:
            if isinstance(arg, basestring) and len(arg) > 3 and ' ' not in arg:
                args.add(arg)

    templates.update(args)
    return templates
def doit(source, expected):
    """Assert that ``expander.get_templates(source, u'')`` equals ``expected``.

    On mismatch the assertion message shows both the expected and the
    actual value.
    """
    actual = expander.get_templates(source, u'')
    assert actual == expected, "expected %r, got %r" % (expected, actual)
def join(self):
    """Finish ZIP file by writing the actual content.

    Phases, in order:

    1. schedule a fetch for every entry in ``self.article_jobs``;
    2. collect the template names used by the fetched articles and
       schedule a fetch for each ``(name, wikidb)`` pair;
    3. parse every fetched article;
    4. add every image listed in ``self.image_infos``;
    5. store ``content.json`` with articles, templates, sources and images.

    ``self.jobsched.join()`` is called after phases 1, 2 and 4.  When
    ``self.status`` is set, it is updated before each phase and split into
    per-phase sub-ranges; otherwise the per-phase status attributes are
    set to ``None``.
    """
    if self.status:
        self.status(status=u'fetching articles')
        self.fetcharticle_status = self.status.getSubRange(0, 20)
        self.fetchtemplate_status = self.status.getSubRange(21, 40)
        self.parse_status = self.status.getSubRange(41, 60)
        self.fetchimages_status = self.status.getSubRange(61, 100)
    else:
        self.fetcharticle_status = None
        self.fetchtemplate_status = None
        self.parse_status = None
        self.fetchimages_status = None

    # Phase 1: articles.
    for job in self.article_jobs:
        self.fetchArticle(
            title=job['title'],
            revision=job['revision'],
            wikidb=job['wikidb'],
        )
    self.jobsched.join()

    # Phase 2: templates referenced by the fetched articles.
    if self.status:
        self.status(status=u'fetching templates', article='')
    wanted_templates = set()
    for job in self.article_jobs:
        try:
            article_raw = self.articles[job['title']]['content']
        except KeyError:
            # No stored content for this job: nothing to scan.
            continue
        for template_name in expander.get_templates(article_raw, job['title']):
            wanted_templates.add((template_name, job['wikidb']))
    self.num_templates = len(wanted_templates)
    self.template_count = 0
    for template_title, template_wikidb in wanted_templates:
        self.fetchTemplate(template_title, template_wikidb)
    self.jobsched.join()

    # Phase 3: parsing.
    if self.status:
        self.status(status=u'parsing articles')
    job_total = len(self.article_jobs)
    for position, job in enumerate(self.article_jobs):
        try:
            article_raw = self.articles[job['title']]['content']
        except KeyError:
            continue
        if self.parse_status:
            self.parse_status(article=job['title'])
        self.parseArticle(
            title=job['title'],
            revision=job['revision'],
            raw=article_raw,
            wikidb=job['wikidb'],
            imagedb=job['imagedb'],
        )
        if self.parse_status:
            self.parse_status(progress=position * 100 / job_total)

    # Phase 4: images.
    if self.status:
        self.status(status=u'fetching images', article='')
    self.num_images = len(self.image_infos)
    self.image_count = 0
    for image_info in self.image_infos:
        self.addImage(*image_info)
    self.jobsched.join()

    # Phase 5: the content index.
    content = dict(
        articles=self.articles,
        templates=self.templates,
        sources=self.sources,
        images=self.images,
    )
    self.addObject('content.json', json.dumps(content))