def getTemplate(self, title, followRedirects=True):
    """Return the template text for *title*, preferring the overlay.

    The title is split with ``namespace.splitname`` using the template
    namespace as default. If ``self.docs`` holds an entry for the
    resulting ``(ns, partial)`` key, that entry is returned; otherwise the
    call is forwarded unchanged to ``self.db.getTemplate``.
    """
    ns, partial, full = namespace.splitname(
        title, defaultns=namespace.NS_TEMPLATE)
    key = (ns, partial)

    try:
        override = self.docs[key]
    except KeyError:
        # No overlay entry: fall through to the wrapped database.
        pass
    else:
        return override

    return self.db.getTemplate(title, followRedirects=followRedirects)
def getRawArticle(self, title, revision=None):
    """Return the raw text for *title*, preferring the overlay.

    If ``self.docs`` has an entry for the ``(ns, partial)`` key produced by
    ``namespace.splitname(title)``, it is returned regardless of
    *revision*; otherwise the request is passed on to
    ``self.db.getRawArticle``.
    """
    ns, partial, full = namespace.splitname(title)
    key = (ns, partial)

    try:
        override = self.docs[key]
    except KeyError:
        # No overlay entry: fall through to the wrapped database.
        pass
    else:
        return override

    return self.db.getRawArticle(title, revision=revision)
def getRawArticle(self, name, revision=None):
    """Fetch raw text from the wrapped db and record what was fetched.

    Returns None (recording nothing) when the db has no text for *name*.
    Pages in the template namespace are stored in ``self.templates`` under
    their partial name. Every other page is stored in ``self.articles``
    under *name*, together with its revision, URL and authors. When the db
    offers ``getSource`` and it yields a mapping containing ``'url'``, that
    URL is attached to the article entry and the source is registered in
    ``self.sources`` the first time it is seen.
    """
    text = self.db.getRawArticle(name, revision=revision)
    if text is None:
        return None

    ns, partial, full = namespace.splitname(name)
    if ns == namespace.NS_TEMPLATE:
        self.templates[partial] = {
            'content-type': 'text/x-wiki',
            'content': text,
        }
        return text

    # Query the db in the same order as the entry is laid out: URL first,
    # then authors.
    url = self.db.getURL(name, revision=revision)
    authors = self.db.getAuthors(name, revision=revision)
    self.articles[name] = {
        'revision': revision,
        'content-type': 'text/x-wiki',
        'content': text,
        'url': url,
        'authors': authors,
    }

    if not hasattr(self.db, 'getSource'):
        return text

    src = self.db.getSource(name, revision=revision)
    if src and 'url' in src:
        source_url = src['url']
        self.articles[name]['source-url'] = source_url
        if source_url not in self.sources:
            self.sources[source_url] = src
    return text
def getRawArticle(self, title, revision=None):
    """Return the stored content for *title*, or None if there is none.

    Titles in the template namespace are delegated to ``self.getTemplate``
    with the partial name. Otherwise the article is looked up through
    ``self._getArticle``; a falsy result yields None.
    """
    ns, partial, full = namespace.splitname(title)
    if ns == namespace.NS_TEMPLATE:
        return self.getTemplate(partial)

    article = self._getArticle(title, revision=revision)
    if not article:
        return None

    content = article['content']
    if isinstance(content, str):
        # fix bug in some simplejson version w/ Python 2.4
        return unicode(content, 'utf-8')
    return content
def __init__(self, db, fn):
    """Load overlay documents from the file *fn* on top of *db*.

    The file is read as bytes and decoded as UTF-8, then split into blocks.
    Each non-empty block consists of a title line followed by the document
    text; the text is stored in ``self.docs`` under the ``(ns, partial)``
    key that ``namespace.splitname`` derives from the title.

    Raises ValueError if a non-empty block contains no newline, and
    whatever ``open`` / decoding raise for an unreadable or non-UTF-8 file.
    """
    self.fn = fn
    self.db = db
    self.docs = {}

    # Close the file deterministically instead of leaving the handle to
    # the garbage collector. try/finally needs no newer syntax than the
    # rest of this Python 2 code.
    f = open(fn, "rb")
    try:
        raw = f.read()
    finally:
        f.close()
    data = unicode(raw, 'utf-8')

    # NOTE(review): the separator literal is reproduced exactly as found;
    # confirm it matches the block delimiter of the overlay file format.
    for block in data.split(" "):
        if not block:
            continue
        title, txt = block.split("\n", 1)
        ns, partial, full = namespace.splitname(title)
        self.docs[(ns, partial)] = txt
def getTemplate(self, name, followRedirects=True):
    """Return the stored template text for *name*, or None if unknown.

    Names that resolve outside the template namespace are handed to
    ``self.getRawArticle`` with their full title. *followRedirects* is
    accepted but not used here.
    """
    ns, name, full = namespace.splitname(name, namespace.NS_TEMPLATE)
    if ns != namespace.NS_TEMPLATE:
        return self.getRawArticle(full)

    try:
        content = self.templates[name]['content']
    except KeyError:
        # Either the template or its 'content' field is missing.
        return None

    if isinstance(content, str):
        # fix bug in some simplejson version w/ Python 2.4
        return unicode(content, 'utf-8')
    return content
def getTemplate(self, name, followRedirects=True):
    """Return the raw text of template *name*, or None if unavailable.

    Note: *Not* following redirects is unsupported!

    Names outside the template namespace are delegated to
    ``self.getRawArticle``. Blacklisted templates return None without
    touching the cache. Otherwise the result (including a None "not found"
    result) is memoized in ``self.template_cache``. If a print template
    pattern is configured, the print variant is tried before the plain
    template. Candidates whose categories cannot be fetched, or which are
    in the configured exclusion category, are skipped.
    """
    ns, name, full = namespace.splitname(name, namespace.NS_TEMPLATE)
    if ns != namespace.NS_TEMPLATE:
        return self.getRawArticle(full)

    normalized = name.replace('_', ' ').lower()
    if normalized in self.template_blacklist:
        log.info("ignoring blacklisted template:" , repr(name))
        return None

    try:
        cached = self.template_cache[name]
    except KeyError:
        pass
    else:
        return cached

    candidates = [u'Template:%s' % name]
    if self.print_template_pattern:
        print_name = self.print_template_pattern.replace(u'$1', name)
        # The print variant takes precedence over the plain template.
        candidates.insert(0, u'Template:%s' % (print_name,))

    for candidate in candidates:
        raw = self.getRawArticle(candidate)
        if raw is None:
            continue

        if self.template_exclusion_category:
            page = self.api_helper.page_query(
                titles=candidate,
                redirects=1,
                prop='categories',
            )
            if page is None:
                log.warn(
                    'Could not get categories for template %r' % candidate)
                continue
            if 'categories' in page:
                # Compare category names without their namespace prefix.
                category_names = [
                    cat.get('title', '').split(':', 1)[-1]
                    for cat in page['categories']
                ]
                if self.template_exclusion_category in category_names:
                    log.info('Skipping excluded template %r' % candidate)
                    continue

        self.template_cache[name] = raw
        return raw

    log.warn('Could not fetch template %r' % name)
    # Remember the miss so the lookup is not repeated.
    self.template_cache[name] = None
    return None
def getTemplate(self, name, followRedirects=False):
    """Return template text for *name*, recording it on first fetch.

    Names outside the template namespace are delegated to
    ``self.getRawArticle``. A template already present in
    ``self.templates`` is answered from there; otherwise it is fetched
    from ``self.db.getTemplate`` and whatever the db returned is stored in
    ``self.templates`` before being handed back.
    """
    ns, name, full = namespace.splitname(name, namespace.NS_TEMPLATE)
    if ns != namespace.NS_TEMPLATE:
        return self.getRawArticle(full)

    try:
        recorded = self.templates[name]['content']
    except KeyError:
        pass
    else:
        return recorded

    text = self.db.getTemplate(name, followRedirects=followRedirects)
    entry = {
        'content-type': 'text/x-wiki',
        'content': text,
    }
    self.templates[name] = entry
    return text
def getTemplatesForArticle(self, title, revision=None):
    """Return a dictionary with all templates used in article with given
    title and revision.

    The templates are requested in a single ``self.api_helper.query`` call
    (``generator=templates`` restricted to namespace 10, with revision
    content). The article is addressed by *revision* when given, else by
    *title*.

    Returns None when the query yields nothing or has no ``'pages'``.
    Otherwise returns a dict mapping each template's partial name to
    ``{'content': raw_text, 'content-type': 'text/x-wiki'}``. Templates
    with missing revision data or empty text are left out. Each returned
    template's raw text is also placed in ``self.template_cache``.
    """
    kwargs = {
        'generator': 'templates',
        'gtllimit': 500,
        'gtlnamespace': 10,
        'prop': 'revisions',
        'rvprop': 'content',
    }
    if revision is None:
        kwargs['titles'] = title
    else:
        kwargs['revids'] = revision

    result = self.api_helper.query(**kwargs)
    if not result:
        return None
    result = result['query']
    if 'pages' not in result:
        return None

    title2raw = {}
    for info in result['pages'].values():
        ns, name, full = namespace.splitname(
            info['title'], namespace.NS_TEMPLATE)
        if ns != namespace.NS_TEMPLATE:
            continue
        try:
            raw = info['revisions'][0]['*']
            if self.redirect_rex.search(raw):
                # The page is a redirect: resolve it through getTemplate.
                raw = self.getTemplate(name)
            if raw:
                title2raw[name] = {
                    'content': raw,
                    'content-type': 'text/x-wiki',
                }
                # getTemplate() hands template_cache entries back verbatim
                # as raw text, so cache the text itself, not the
                # descriptor dict.
                self.template_cache[name] = raw
        except (KeyError, IndexError):
            # Best effort: skip pages with missing revision data.
            continue
    return title2raw