def get_table_rows(self, conn):
    """Yield report rows for articles transcluding a ticker symbol
    template in their lead section.

    Collects every template in Category:Ticker symbol templates plus
    all of their redirects, scans the lead section (rvsection=0) of
    each transcluding article, and yields up to 1000 rows of the form
    [u'{{dbr link|1=<title>}}', '<instance count>'].

    The `conn` parameter is unused here but kept for interface
    compatibility with the other report generators.
    """
    templates_in_cat = set()
    params = {
        'action': 'query',
        'list': 'categorymembers',
        'cmtitle': 'Category:Ticker symbol templates',
        'cmnamespace': 10,
        'cmlimit': 'max',
        'format': 'json'
    }
    request = wikitools.APIRequest(self.wiki, params)
    response = request.query(querycontinue=True)
    members = response['query']['categorymembers']
    for member in members:
        # Strip the "Template:" namespace prefix.
        templates_in_cat.add(member[u'title'].split(':', 1)[1])
    # Include every redirect so alternative spellings are also matched.
    template_variations = templates_in_cat
    for template in templates_in_cat:
        template_redirects = self.get_template_redirects(template)
        template_variations = template_variations.union(template_redirects)
    page_texts = {}
    for template in templates_in_cat:
        params = {
            'action': 'query',
            'generator': 'embeddedin',
            'geititle': 'Template:%s' % template,
            'geinamespace': 0,
            'geilimit': 'max',
            'prop': 'revisions',
            'rvprop': 'content',
            'rvsection': 0,  # lead section only
            'format': 'json'
        }
        request = wikitools.APIRequest(self.wiki, params)
        response = request.query(querycontinue=True)
        try:
            pages = response['query']['pages']
        except KeyError:
            # This means no transclusions
            continue
        for page_id, page_data in pages.iteritems():
            page_title = page_data['title']
            page_text = page_data['revisions'][0]['*']
            page_texts[page_title] = page_text
    i = 1
    # re.escape each name: template titles may contain regex
    # metacharacters (e.g. "." or "+"), which would otherwise corrupt
    # the alternation or silently match the wrong text.
    alternation = '|'.join(re.escape(t) for t in template_variations)
    ticker_templates_re = re.compile(
        r"\{\{(%s)\|" % alternation, re.I)
    ticker_templates_in_lead_re = re.compile(
        r"'''.+\{\{(%s)\|" % alternation, re.I)
    for title, text in page_texts.iteritems():
        if i > 1000:
            break
        instances = len(ticker_templates_re.findall(text))
        # The bolded title ('''...''') followed by a ticker template is
        # the heuristic for "template used in the lead sentence".
        if ticker_templates_in_lead_re.search(text):
            yield [u'{{dbr link|1=%s}}' % title, str(instances)]
            i += 1
def getCatMembers(self): """ Get the members of the specified category and their metadata. Example: http://meta.wikimedia.org/w/api.php?action=query&list=categorymembers&cmtype=page&cmtitle=Category:IEG/Proposals/IdeaLab&cmnamespace=200&cmprop=title|timestamp|ids&cmsort=timestamp&cmdir=desc&format=jsonfm ...will return a dict like {'page id' : someid, 'page path' : 'somepath', 'datetime added' : 'sometimestamp'} """ if self.mem_type == 'page': query_params = { 'action': 'query', 'list': 'categorymembers', 'cmtitle': self.cat_title, 'cmtype': self.mem_type, 'cmnamespace': self.mem_namespace, 'cmprop': 'title|timestamp|ids', 'cmsort': 'timestamp', 'cmdir': 'desc', 'rawcontinue': '1', } req = wikitools.APIRequest(self.wiki, query_params) response = req.query() mem_list = [{ 'page id': str(x['pageid']), 'page path': x['title'], 'timestamp': x['timestamp'] } for x in response['query']['categorymembers']] for mem in mem_list: mem = self.getPageMetaData(mem) return mem_list else: print "not set up to get " + self.mem_type + " category members yet"
def last_log_entry(page):
    """Return the most recent protection log entry for `page`.

    Returns a dict with keys 'timestamp' (formatted YYYYMMDDHHMMSS),
    'user' and 'comment'; each value falls back to '' when there is no
    matching log entry, the field is missing, or the timestamp cannot
    be parsed.
    """
    params = {
        'action': 'query',
        'list': 'logevents',
        'lelimit': '1',
        'letitle': page,
        'format': 'json',
        'ledir': 'older',
        'letype': 'protect',
        'leprop': 'user|timestamp|comment'
    }
    request = wikitools.APIRequest(wiki, params)
    response = request.query(querycontinue=False)
    lastlog = response['query']['logevents']
    # Narrowed from bare `except:` clauses: only the expected failure
    # modes (empty log list, missing field, unparseable timestamp) are
    # swallowed, so KeyboardInterrupt/SystemExit still propagate.
    try:
        timestamp = datetime.datetime.strptime(
            lastlog[0]['timestamp'],
            '%Y-%m-%dT%H:%M:%SZ').strftime('%Y%m%d%H%M%S')
    except (IndexError, KeyError, ValueError):
        timestamp = ''
    try:
        user = lastlog[0]['user']
    except (IndexError, KeyError):
        user = ''
    try:
        comment = lastlog[0]['comment']
    except (IndexError, KeyError):
        comment = ''
    return {'timestamp': timestamp, 'user': user, 'comment': comment}
def _resolve_redirects_to_templates(self, templates):
    """Expand `templates` with the titles of all redirects pointing at
    the configured citation-needed templates.

    Returns a set containing the input names plus every redirect title
    with its namespace prefix stripped. When no wiki is configured
    (testing), the input is returned unchanged as a set.
    """
    resolved = set(templates)
    if self._wikipedia is None:
        # Testing
        return resolved
    # The API resolves Template: to the relevant per-language prefix
    titles = '|'.join(
        'Template:' + name
        for name in self._cfg.citation_needed_templates)
    params = {
        'action': 'query',
        'format': 'json',
        'prop': 'redirects',
        'titles': titles,
        'rnamespace': 10,
    }
    request = wikitools.APIRequest(self._wikipedia, params)
    # We could fall back to just using self._cfg.citation_needed_templates
    # if the API request fails, but for now let's just crash
    for result in request.queryGen():
        for page in result['query']['pages'].values():
            for redirect in page.get('redirects', []):
                # TODO We technically only need to keep the templates that
                # mwparserfromhell will consider different from one another
                # (e.g., no need to have both Cn and CN)
                title = redirect['title']
                if ':' not in title:
                    # Not a template?
                    continue
                resolved.add(title.split(':', 1)[1])
    return resolved
def _to_html(self, snippet):
    """Render `snippet` (wikitext) to HTML via the parse API.

    Returns '' when the API request fails for any reason; returns the
    snippet unchanged when no wiki is configured (testing).
    """
    if self._wikipedia is None:
        # Testing
        return snippet
    params = {
        'action': 'parse',
        'format': 'json',
        'text': snippet,
    }
    request = wikitools.APIRequest(self._wikipedia, params)
    # FIXME Sometimes the request fails because the text is too long;
    # in that case, the API response is HTML, not JSON, which raises
    # an exception when wikitools tries to parse it.
    #
    # Normally this would cause wikitools to happily retry forever
    # (https://github.com/alexz-enwp/wikitools/blob/b71481796c350/wikitools/api.py#L304),
    # which is a bug, but due to our use of a custom opener, wikitools'
    # handling of the exception raises its own exception: the object returned
    # by our opener doesnt support seek().
    #
    # We use that interesting coincidence to catch the exception and move
    # on, bypassing wikitools' faulty retry, but this is obviously a terrible
    # "solution".
    try:
        html = request.query()['parse']['text']['*']
    except Exception:
        # Deliberate best-effort fallback (see FIXME above). Narrowed
        # from a bare `except:` so KeyboardInterrupt/SystemExit are not
        # swallowed.
        return ''
    return self._cleanup_snippet_html(html)
def get_template_redirects(self, template_title):
    """Return the set of template names (namespace prefix stripped)
    that redirect to Template:<template_title>."""
    params = {
        'action': 'query',
        'list': 'backlinks',
        'bltitle': 'Template:%s' % template_title,
        'blnamespace': 10,
        'blfilterredir': 'redirects',
        'format': 'json'
    }
    api_request = wikitools.APIRequest(self.wiki, params)
    api_response = api_request.query(querycontinue=True)
    # Keep only the part after the "Template:" prefix.
    return set(
        link[u'title'].split(':', 1)[1]
        for link in api_response['query']['backlinks'])
def query_pageids(wiki, pageids):
    """Yield (id, title, text) for each requested page id.

    Pages with no 'title' in the response or with empty revision text
    are skipped. Titles and texts are passed through the d() helper.
    """
    params = {
        'action': 'query',
        'pageids': '|'.join(str(pid) for pid in pageids),
        'prop': 'revisions',
        'rvprop': 'content'
    }
    request = wikitools.APIRequest(wiki, params)
    for response in request.queryGen():
        for page_id, page in response['query']['pages'].items():
            if 'title' not in page:
                continue
            title = d(page['title'])
            text = page['revisions'][0]['*']
            if not text:
                continue
            yield (page_id, title, d(text))
def queryApi(self, apiurl, query):
    """
    This function queries the API by running query on apiurl and
    outputs the result in JSON format.

    - apiurl (string): The URL to the API's base.
    - query (dict): A dictionary of API parameters.

    Returns: Dict with the API results.

    Raises: TypeError if `query` is not a dict.

    TODO: The API query should be reimplemented here so that we do not
    have the wikitools library requirement.
    """
    # Validate before constructing the Wiki object so bad input fails
    # fast with no side effects; isinstance also accepts dict
    # subclasses, unlike the previous `type(query) != dict` check.
    if not isinstance(query, dict):
        raise TypeError('Query parameter should be type dict'
                        ', got %s instead' % (type(query)))
    site = wikitools.Wiki(apiurl)
    api_request = wikitools.APIRequest(site, query)
    return api_request.query(querycontinue=False)
def getPageMetaData(self, mempage):
    # Need to make this a call to profiles.py.
    """Gets some additional metadata about each page. Currently just
    the local talkpage id or subjectid and the full url.

    Mutates and returns `mempage`, adding a 'talkpage id' key ('' when
    the page has no talkid in the API response).
    """
    params = {
        'action': 'query',
        'titles': mempage['page path'],
        'prop': 'info',
        'inprop': 'talkid|subjectid|url',
        'rawcontinue': '1',
    }
    api_request = wikitools.APIRequest(self.wiki, params)
    api_response = api_request.query()
    page_key = str(mempage['page id'])
    # The whole chain stays inside the try: a missing page entry or a
    # missing 'talkid' both fall back to the empty string.
    try:
        mempage['talkpage id'] = str(
            api_response['query']['pages'][page_key]['talkid'])
    except KeyError:
        # probably not necessary anymore, if I add these default
        # params in to every one anyway.
        mempage['talkpage id'] = ""
    return mempage
wikidataCodes[x:x + chunkSize] for x in xrange(0, len(wikidataCodes), chunkSize) ] #Fetch data for c in chunks: print("Getting %d items from Wikidata..." % len(c)) cQueryString = '|'.join(c) params = { 'action': 'wbgetentities', 'languages': languagesQueryString, 'props': 'labels|claims', 'ids': cQueryString, 'format': 'json' } request = wikitools.APIRequest(site, params) result = request.query() if "entities" in result: for qid, e in result["entities"].items(): nation = {} # Look for translations (labels) nation["n"] = getLabelsFromEntity(e) #Lookfor properties if "claims" in e: # Look for entity properties # These will be pointers to other WD enteties, from which we will get the names later for pkey, (pid, abbr) in settings.entityProperties.items():
# Build wikitable rows for up to 1000 random mainspace articles that
# appear to contain no images (no [[File:/[[Image: link and no
# .jpg/.png/.gif filename in the wikitext).
i = 1
output = []
params = {
    'action': 'query',
    'generator': 'random',  # a fresh random batch per request
    'grnnamespace': 0,      # mainspace only
    'grnlimit': 'max',
    'prop': 'revisions',
    'rvprop': 'content',
    'format': 'json'
}
while True:
    if i > 1000:
        break
    # Keep requesting random batches until 1000 rows are collected.
    request = wikitools.APIRequest(wiki, params)
    response = request.query(querycontinue=False)
    pages = response['query']['pages']
    for page_id, page_data in pages.iteritems():
        if i > 1000:
            break
        page_title = page_data['title']
        page_text = page_data['revisions'][0]['*']
        # Heuristic image check: an image/file link or a common image
        # file extension anywhere in the text disqualifies the page.
        if not re.search(r'(\[\[(file:|image:)|\.jpg|\.png|\.gif)',
                         page_text, re.I):
            # One numbered wikitable row per image-less article.
            table_row = u"""\
|-
| %d
| {{dbr link|1=%s}}""" % (i, page_title)
            output.append(table_row)
            i += 1