def get_random_page_ids_for_lang(self, lang_code):
    # Call the API to get a random sample of page ids
    site = self.sites[lang_code]
    # API allows random page_ids to be retrieved max 500 at a time
    page_ids = []
    params = {
        'action': 'query',
        'generator': 'random',
        'grnnamespace': '0',
        'grnlimit': 500,
        'prop': 'info'
    }
    for i in range(0, int(self.pages_per_lang / 500)):
        request = api.APIRequest(site, params)
        result = request.query()
        page_ids.extend(result['query']['pages'].keys())
    if self.pages_per_lang % 500 > 0:
        params['grnlimit'] = self.pages_per_lang % 500
        request = api.APIRequest(site, params)
        result = request.query()
        page_ids.extend(result['query']['pages'].keys())
    return page_ids
def find_outer_section(title, text, id):
    # Check if the closing comment is in here; if not, look for the outer section.
    # If there is an outer section, choose it only if it has a closing statement.
    if len(title) > 1:
        section_title = title[1].encode('ascii', 'ignore')
        params = {
            'action': 'query',
            'titles': title[0],
            'prop': 'revisions',
            'rvprop': 'content',
            'format': 'json',
            'redirects': 'yes'
        }
        result = api.APIRequest(site, params).query()
        whole_text = _clean_wiki_text(
            result['query']['pages'][id]['revisions'][0]['*'])
        import wikichatter as wc
        parsed_whole_text = wc.parse(whole_text.encode('ascii', 'ignore'))
        sections = parsed_whole_text['sections']
        for outer_section in sections:
            found_subsection = get_section(outer_section['subsections'],
                                           section_title)
            if found_subsection:
                outer_comments = outer_section['comments']
                for comment in outer_comments:
                    comment_text = '\n'.join(comment['text_blocks'])
                    if re.search(_CLOSE_COMMENT_RE, comment_text):
                        params = {
                            'action': 'parse',
                            'prop': 'sections',
                            'page': title[0],
                            'redirects': 'yes'
                        }
                        result = api.APIRequest(site, params).query()
                        for s in result['parse']['sections']:
                            if s['line'] == outer_section.get('heading').strip():
                                section_index = s['index']
                                params = {
                                    'action': 'query',
                                    'titles': title[0],
                                    'prop': 'revisions',
                                    'rvprop': 'content',
                                    'rvsection': section_index,
                                    'format': 'json',
                                    'redirects': 'yes'
                                }
                                result = api.APIRequest(site, params).query()
                                final_section_text = result['query'][
                                    'pages'][id]['revisions'][0]['*']
                                return final_section_text
    return text
def fileHook(parser_env, namespace, body):
    (file_name, pipe, size) = body.partition('|')
    site = wiki.Wiki('https://en.wikipedia.org/w/api.php')
    params = {
        'action': 'query',
        'titles': 'File:' + file_name,
        'prop': 'imageinfo',
        'iiprop': 'url|thumbmime',
        'iiurlwidth': size
    }
    request = api.APIRequest(site, params)
    result = request.query()
    try:
        # Take the single page returned and read its thumbnail info
        imageinfo = list(result['query']['pages'].values())[0]['imageinfo'][0]
        url = imageinfo['thumburl']
        desc_url = imageinfo['descriptionurl']
        width = imageinfo['thumbwidth']
        height = imageinfo['thumbheight']
    except (KeyError, IndexError):
        return file_name
    text = '<a href="%s" class="image">' % desc_url
    text += '<img alt="%s" src="%s" width="%s" height="%s"></a>' % (
        file_name, url, width, height)
    return text
def parse_text(content):
    links = {}
    rough_links = re.findall(linkP, content)
    title_lists = list2params(rough_links)
    for title_list in title_lists:
        params = {
            'action': 'query',
            'titles': title_list,
            'redirects': 1
        }
        request = api.APIRequest(site, params)
        if very_verbose:
            print(' query: ' + str(params))
        result = request.query()
        for page in result['query']['pages']:
            if page != '-1' and 'ns' in result['query']['pages'][page]:
                if result['query']['pages'][page]['ns'] == 0:
                    link = result['query']['pages'][page]['title']
                    links[link] = 1
    return links
def extlinks_extraction(self, lang, title):
    links = []
    linklist = []
    site = wiki.Wiki("https://" + lang + ".wikipedia.org/w/api.php")
    params = {
        'action': 'query',
        'titles': title,
        'prop': 'extlinks',
        'ellimit': 500
    }
    req = api.APIRequest(site, params)
    for res in req.queryGen():
        for pidkey in res['query']['pages']:
            if 'extlinks' in res['query']['pages'][pidkey]:
                linklist = res['query']['pages'][pidkey]['extlinks'] + linklist
        links = links + linklist
        linklist = []
    return links
def queries():
    if args.queryfile:
        for l in open(args.queryfile):
            yield l.strip()
    elif args.query:
        yield args.query
    elif args.category and not HAS_WIKITOOLS:
        sys.exit(
            "-cat option given, but wikitools package is not present, "
            "see <https://github.com/alexz-enwp/wikitools>")
    elif args.category and HAS_WIKITOOLS:
        site = wiki.Wiki("https://commons.wikimedia.org/w/api.php")
        params = {
            'action': 'query',
            'prop': 'imageinfo',
            'iiprop': 'url',
            'generator': 'categorymembers',
            'gcmtitle': 'Category:' + args.category,
            'gcmnamespace': '6',
            'gcmprop': 'title'
        }
        req = api.APIRequest(site, params)
        for data in req.queryGen():
            for key in data['query']['pages'].keys():
                url = data['query']['pages'][key]['imageinfo'][0]['url']
                yield re.sub("https://upload.wikimedia.org", "", url)
    else:
        sys.exit("No query given")
def query_page_ids_for_lang(self, query, lang_code):
    # Call the API to query a sample of page ids
    site = self.sites[lang_code]
    page_ids = []
    params = {
        'action': 'query',
        'generator': 'search',
        'gsrwhat': 'text',
        'gsrsearch': query,
        'gsrlimit': 10,  # limit parameter for the search generator
        'prop': 'info'
    }
    request = api.APIRequest(site, params)
    result = request.query()
    if 'query' in result and 'pages' in result['query']:
        page_ids = result['query']['pages'].keys()
        print('page_ids: ' + str(page_ids))
    if page_ids is None:
        page_ids = []
    return page_ids
def fetch_log(self, ignore_callback=False):
    query_params = {
        'action': 'query',
        'list': 'abuselog',
        'aflprop': 'ids|user|title|action|result|filter'
    }
    request = api.APIRequest(self.wiki, query_params)
    try:
        result = request.query(querycontinue=False)
    except api.APIError as e:
        self.log.debug('Caught APIError in fetch_log: %s, isLoggedIn(%s)=%s',
                       e, self.username, self.wiki.isLoggedIn(self.username))
        return
    items = result['query']['abuselog']
    items.reverse()
    if self.last_log_id:
        items = [item for item in items if item['id'] > self.last_log_id]
    if items:
        self.last_log_id = items[-1]['id']
    # Exclude ignored filters
    items = [item for item in items
             if item['filter_id'] not in self.ignored_filters]
    if not ignore_callback:
        for item in items:
            self.callback(self.wiki_name, item)
def wikipedia_query(query_params):
    """An extremely basic wrapper for the wikitools api."""
    site = wiki.Wiki()  # This defaults to en.wikipedia.org
    request = api.APIRequest(site, query_params)
    result = request.query()
    return result[query_params['action']]
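# A minimal usage sketch for wikipedia_query(); the query parameters below are
# illustrative and not part of the original snippet. For action=query the
# wrapper returns result['query'], so page data lives under the 'pages' key.
example = wikipedia_query({
    'action': 'query',
    'titles': 'Python (programming language)',
    'prop': 'info'
})
for page_id, info in example['pages'].items():
    print(page_id, info.get('title'))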
def test_parseJSON_bad_data_nonwrite(self):
    self.site.apibase = "http://localhost/w/index.php"
    self.site.maxwaittime = 10
    params = {}
    with self.assertWarns(UserWarning):
        with self.assertRaises(wikitools.exceptions.APIFailure):
            req = api.APIRequest(self.site, params)
            req.query(False)
def test_getRaw_HTTP_error_nonwrite(self):
    self.site.apibase = "http://httpbin.org/status/500"
    self.site.maxwaittime = 10
    params = {}
    with self.assertWarns(UserWarning):
        with self.assertRaises(requests.exceptions.HTTPError):
            req = api.APIRequest(self.site, params)
            req.query(False)
def get_article(url, source_id, rfc_DB):
    cmd = 'select id, disqus_id, section_index, title from website_article where url = %s'
    article_result = rfc_DB.fetch_one(cmd, (urllib2.unquote(url), ))
    if article_result is not None:
        article_id, disqus_id, section_index, title = article_result
        return article_id, disqus_id, section_index, title
    else:
        if 'wikipedia.org/wiki/' in url:
            url_parts = url.split('/wiki/')
            wiki_sub = url_parts[1].split(':')
            wiki_parts = ':'.join(wiki_sub[1:]).split('#')
            wiki_page = wiki_parts[0]
            section = None
            if len(wiki_parts) > 1:
                section = wiki_parts[1]
            from wikitools import wiki, api
            site = wiki.Wiki(_DOMAIN + '/w/api.php')
            page = urllib2.unquote(
                str(wiki_sub[0]) + ':' + wiki_page.encode('ascii', 'ignore'))
            params = {
                'action': 'parse',
                'prop': 'sections',
                'page': page,
                'redirects': 'yes'
            }
            try:
                request = api.APIRequest(site, params)
                result = request.query()
                disqus_id = str(result['parse']['pageid'])
                section_title = None
                section_index = None
                if section:
                    for s in result['parse']['sections']:
                        if s['anchor'] == section:
                            disqus_id = str(disqus_id) + '#' + str(s['index'])
                            section_title = s['line']
                            section_index = s['index']
                title = result['parse']['title']
                if section_title is not None:
                    title = title + ' - ' + section_title
                link = urllib2.unquote(url)
                article_insert_command = (
                    "insert into website_article "
                    "(disqus_id, title, url, source_id, section_index) "
                    "values (%s, %s, %s, %s, %s)")
                article_id = rfc_DB.insert(
                    article_insert_command,
                    (disqus_id, title, link, source_id, section_index))
                return article_id, disqus_id, section_index, title
            except api.APIError as e:
                print(e)
def import_wiki_authors(authors, rfc_DB):
    found_authors = set()
    anonymous_exist = False
    for author in authors:
        if author:
            found_authors.add(author)
        else:
            anonymous_exist = True
    authors_list = '|'.join(found_authors)
    from wikitools import wiki, api
    site = wiki.Wiki(_DOMAIN + '/w/api.php')
    params = {
        'action': 'query',
        'list': 'users',
        'ususers': authors_list,
        'usprop': 'blockinfo|groups|editcount|registration|emailable|gender',
        'format': 'json'
    }
    request = api.APIRequest(site, params)
    result = request.query()
    comment_authors = []
    for user in result['query']['users']:
        comment_author_id = None
        try:
            author_id = user['userid']
            # First check if the author exists using the username
            command = "select id from website_commentauthor where username = %s"
            (comment_author_id, ) = rfc_DB.fetch_one(command, (user['name'], ))
            # If no author exists with the same username, insert a new one
            if comment_author_id is None:
                author_insert_command = (
                    "insert into website_commentauthor "
                    "(username, disqus_id, joined_at, edit_count, gender, groups, is_wikipedia) "
                    "values (%s, %s, %s, %s, %s, %s, %s)")
                joined_at = datetime.datetime.strptime(user['registration'],
                                                       '%Y-%m-%dT%H:%M:%SZ')
                params = (user['name'], author_id, joined_at, user['editcount'],
                          user['gender'], ','.join(user['groups']), 1)
                comment_author_id = rfc_DB.insert(author_insert_command, params)
        except Exception:
            command = ("insert into website_commentauthor (username, is_wikipedia) "
                       "values (%s, %s)")
            comment_author_id = rfc_DB.insert(command, (user['name'], 1))
        if comment_author_id is not None:
            comment_authors.append(comment_author_id)
    if anonymous_exist:
        anonymous_id = rfc_DB.get_anonymous_id()
        comment_authors.append(anonymous_id)
    return comment_authors
def wiki_get_edittoken(w_site):
    params = {'action': 'tokens'}
    req = api.APIRequest(w_site, params)
    res = req.query(querycontinue=False)
    w_site_edittoken = res['tokens']['edittoken']
    print('token: ' + w_site_edittoken)
    return w_site_edittoken
def getlist(wikia, wkfrom=1, wkto=1000):
    params = {
        'action': 'query',
        'list': 'wkdomains',
        'wkactive': '1',
        'wkfrom': wkfrom,
        'wkto': wkto,
    }
    request = api.APIRequest(wikia, params)
    return request.query()['query']['wkdomains']
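# A short, hypothetical usage sketch for getlist(); the Fandom/Wikia endpoint
# below is an assumption, and the exact shape of each 'wkdomains' entry may
# vary, so only the chunk size is printed.
from wikitools import wiki

wikia_site = wiki.Wiki("https://community.fandom.com/api.php")  # assumed endpoint
for start in range(1, 5001, 1000):
    chunk = getlist(wikia_site, wkfrom=start, wkto=start + 1000)
    print("ids %d-%d: %d domains" % (start, start + 1000, len(chunk)))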
def test_listFromQuery(self):
    params = {"action": "query", "list": "allpages"}
    req = api.APIRequest(self.site, params)
    res = req.query(False)
    api.logging = True
    pages = pagelist.listFromQuery(self.site, res["query"]["allpages"])
    for item in pages:
        self.assertIsInstance(item, page.Page)
        self.assertTrue(item.exists)
    self.assertEqual(len(api.querylog), 0)
def getBacklinks(self):
    '''Find all pages that link to this page.'''
    params = {
        'action': 'query',
        'list': 'backlinks',
        'bltitle': self.title,
    }
    apiresponse = api.APIRequest(self.site, params).query()
    return [x['title'] for x in apiresponse['query']['backlinks']]
def test_getRaw_HTTP_error_write(self):
    self.site.apibase = "http://httpbin.org/status/500"
    params = {}
    warnings.filterwarnings("error", category=UserWarning, module="wikitools.api")
    with self.assertRaises(requests.exceptions.HTTPError):
        req = api.APIRequest(self.site, params, write=True)
        req.query(False)
    warnings.filterwarnings("default", category=UserWarning, module="wikitools.api")
def test_parseJSON_bad_data_write(self):
    self.site.apibase = "http://localhost/w/index.php"
    params = {}
    warnings.filterwarnings("error", category=UserWarning, module="wikitools.api")
    with self.assertRaises(wikitools.exceptions.APIFailure):
        req = api.APIRequest(self.site, params, write=True)
        req.query(False)
    warnings.filterwarnings("default", category=UserWarning, module="wikitools.api")
def import_wiki_authors(authors, article):
    found_authors = []
    anonymous_exist = False
    for author in authors:
        if author:
            found_authors.append(author)
        else:
            anonymous_exist = True
    authors_list = '|'.join(found_authors)
    from wikitools import wiki, api
    domain = article.url.split('/wiki/')[0]
    site = wiki.Wiki(domain + '/w/api.php')
    params = {
        'action': 'query',
        'list': 'users',
        'ususers': authors_list,
        'usprop': 'blockinfo|groups|editcount|registration|emailable|gender',
        'format': 'json'
    }
    request = api.APIRequest(site, params)
    result = request.query()
    comment_authors = []
    for user in result['query']['users']:
        try:
            author_id = user['userid']
            comment_author = CommentAuthor.objects.filter(disqus_id=author_id)
            if comment_author.count() > 0:
                comment_author = comment_author[0]
            else:
                # Parse the registration timestamp before storing it
                joined_at = datetime.datetime.strptime(user['registration'],
                                                       '%Y-%m-%dT%H:%M:%SZ')
                comment_author = CommentAuthor.objects.create(
                    username=user['name'],
                    disqus_id=author_id,
                    joined_at=joined_at,
                    edit_count=user['editcount'],
                    gender=user['gender'],
                    groups=','.join(user['groups']),
                    is_wikipedia=True)
        except Exception:
            comment_author = CommentAuthor.objects.create(
                username=user['name'], is_wikipedia=True)
        comment_authors.append(comment_author)
    if anonymous_exist:
        comment_authors.append(
            CommentAuthor.objects.get(disqus_id='anonymous', is_wikipedia=True))
    return comment_authors
def crawl(url_param):
    # Fix eventual full URL
    url_param = unquote_plus(basename(url_param))

    # Generate query
    params = {
        'action': 'query',
        'prop': 'imageinfo|revisions',
        'iiprop': 'url|sha1|size',
        'rvprop': 'content',
        'rawcontinue': ''
    }
    url_type = get_url_type(url_param)
    if url_type == 'category':
        params['generator'] = 'categorymembers'
        params['gcmtitle'] = url_param
        params['gcmlimit'] = 'max'
    elif url_type == 'file':
        params['titles'] = url_param
    else:
        params['generator'] = 'images'
        params['titles'] = url_param
        params['gimlimit'] = 'max'

    # Call API
    site = wiki.Wiki(API_URL)
    request = api.APIRequest(site, params)
    print_verbose("Site: %s" % str(site), 2)
    print_verbose("Query: ", 2)
    pprint_verbose(params, 2)
    result = request.query(querycontinue=True)
    print_verbose("Result: ", 4)
    pprint_verbose(result, 4)

    # Check result
    if 'error' in result:
        raise Error(result['error'])
    if 'warnings' in result:
        sys.stderr.write(result['warnings'])
        return None
    if '-1' in result['query']['pages']:
        sys.stderr.write(result['query']['pages']['-1'])
        return None
    return result['query']['pages']
def test_parseJSON_maxlag(self):
    site = wiki.Wiki("https://en.wikipedia.org/w/api.php")
    params = {"action": "query"}
    req = api.APIRequest(site, params)
    req.changeParam("maxlag", "-1")
    warnings.filterwarnings("error", category=UserWarning, module="wikitools.api")
    with self.assertRaises(UserWarning):
        req.query(False)
    warnings.filterwarnings("default", category=UserWarning, module="wikitools.api")
def toolbar_icon_clicked(self, widget, movie):
    import pprint  # Used for formatting the output for viewing, not necessary for most code
    from wikitools import wiki, api
    site = wiki.Wiki("http://de.wikipedia.org/w/api.php")
    params = {
        'action': 'query',
        'list': 'search',
        'srsearch': 'rocky',
        'srprop': '',
        'srlimit': '50'
    }
    req = api.APIRequest(site, params)
    res = req.query(querycontinue=False)
    pprint.pprint(res)
def get_usercontribs(lang, site, params, stop_date):
    user_contrib = []
    req = api.APIRequest(site, params)
    for resp in req.queryGen():
        pages = resp['query']['usercontribs']
        before_end = [
            p for p in pages
            if datetime.strptime(p['timestamp'], "%Y-%m-%dT%H:%M:%SZ") > stop_date
        ]
        user_contrib.extend(before_end)
        if len(pages) != len(before_end):
            # Has hit stop date if some pages are filtered away
            break
    return lang, user_contrib
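# Hypothetical call sketch for get_usercontribs(); the endpoint, username and
# limits are illustrative. 'list=usercontribs' with ucuser/ucprop/uclimit is
# the standard MediaWiki contributions query the function paginates over.
from datetime import datetime
from wikitools import wiki

en_site = wiki.Wiki("https://en.wikipedia.org/w/api.php")
contrib_params = {
    'action': 'query',
    'list': 'usercontribs',
    'ucuser': 'ExampleUser',      # illustrative username
    'ucprop': 'title|timestamp',
    'uclimit': 500
}
lang, contribs = get_usercontribs('en', en_site, contrib_params,
                                  datetime(2020, 1, 1))
print(lang, len(contribs), "contributions newer than 2020-01-01")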
def get_linked_here(self, page):
    args = {'action': 'query', 'prop': 'linkshere', 'titles': page}
    logger.debug("processing Links Here list")
    request = api.APIRequest(self.site, args)
    results = request.queryGen()
    titles = []
    for r in results:
        pages = r['query']['pages']
        key, values = pages.popitem()
        for value in values['linkshere']:
            titles.append(value['title'])
    self.logger.info("Returning {:d} titles".format(len(titles)))
    return titles
def search_disambiguation_page(self, title):
    search_title = title + self.search_disambig
    args = {
        'action': 'query',
        'list': 'search',
        'srsearch': search_title,
        'srprop': 'size|wordcount'
    }
    request = api.APIRequest(self.site, args)
    results = request.queryGen()
    titles = []
    for r in results:
        for values in r['query']['search']:
            titles.append(values['title'])
    return titles
def wiki_page_set_cnt(w_site, w_site_edittoken, p_title, p_content, p_summary):
    print("Upload ...")
    params = {
        'action': 'edit',
        'title': p_title,
        'summary': p_summary,
        'text': p_content,
        'token': w_site_edittoken
    }
    req = api.APIRequest(w_site, params)
    res = req.query(querycontinue=False)
    print("... done.")
    pprint.pprint(res)
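# End-to-end sketch combining wiki_get_edittoken() and wiki_page_set_cnt(); the
# endpoint, credentials and page content are assumptions for illustration.
# These helpers rely on the legacy action=tokens API, so the target wiki must
# still support it.
from wikitools import wiki

bot_site = wiki.Wiki("https://example.org/w/api.php")  # assumed wiki endpoint
# bot_site.login("BotUser", "botpassword")             # uncomment if the wiki requires login
edit_token = wiki_get_edittoken(bot_site)
wiki_page_set_cnt(bot_site, edit_token, "Sandbox",
                  "Hello from the bot.", "test edit via API")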
def get_cords(titles):
    """Get coordinates for titles via mediawiki API."""
    par = {'action': 'query', 'titles': titles, 'prop': 'coordinates',
           'colimit': 500, 'format': 'json'}
    p_req = api.APIRequest(site, par)
    for resp in p_req.queryGen():
        for t, page in resp['query']['pages'].items():
            if 'coordinates' in page:
                print(f"{page['title']} --- N {page['coordinates'][0]['lat']}"
                      f", E {page['coordinates'][0]['lon']}")
                yield [page['title'], page['coordinates'][0]['lat'],
                       page['coordinates'][0]['lon']]
            else:
                print(f"{page['title']} --- missing cords: {titles}")
                yield [page['title'], '', '']
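# A brief usage sketch for get_cords(); it assumes the module-level `site`
# handle used by the function is already initialized, and the titles are
# illustrative. Titles are passed as one '|'-separated string, matching the
# 'titles' parameter the function forwards to the API.
for title, lat, lon in get_cords("Berlin|Paris|Tokyo"):
    print(title, lat, lon)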
def getChanges(self):
    print('getChanges from:', self.dataTS)
    with wiki.Wiki(url=self.siteurl) as mywiki:
        try:
            # Get recent edits and new titles.
            urldata = {
                'action': 'query',
                'list': 'recentchanges',
                'rcprop': 'title|timestamp|loginfo',
                'rctype': 'log|edit|new',
                'rcdir': 'newer',
                'rcstart': self.dataTS,
                'rclimit': MAX_LE_OR_RC,
                'maxlag': MAX_LAG
            }
            therequest = api.APIRequest(mywiki, urldata)
            for req in therequest.queryGen():
                for title in req['query']['recentchanges']:
                    if title['type'] == 'new' or title['type'] == 'edit':
                        self.addifnewer(title['title'], title['timestamp'])
                    if title['type'] == 'log':
                        if title['logaction'] == 'delete':
                            self.addifnewer(title['title'], title['timestamp'], True)
                        if title['logaction'] == 'move':
                            self.addifnewer(title['logparams']['target_title'],
                                            title['timestamp'])
                            if 'suppressredirect' in title['logparams']:
                                self.addifnewer(title['logparams']['target_title'],
                                                title['timestamp'], True)
                            else:
                                self.addifnewer(title['title'], title['timestamp'])
            myprintstay('rcpages:', str(len(self.changes)))
            self.jsondump()
        except Exception as e:
            print(e)
            raise e
    print('Exiting getChanges')
    return True
def loadPages(site, pages, outDir, wait=3):
    count = 0
    for page in pages:
        print("Loading page", page,
              "(" + str(count + 1) + "/" + str(len(pages)) + ")", file=sys.stderr)
        # define the params for the query
        params = {'action': 'query', 'titles': page, 'export': None}
        # create the request object
        request = api.APIRequest(site, params)
        # query the API
        result = request.query()
        print("Writing result", file=sys.stderr)
        f = codecs.open(os.path.join(outDir, page + ".xml"), "wt", "utf-8")
        f.write(result["query"]["export"]["*"])
        f.close()
        print("Sleeping", file=sys.stderr)
        time.sleep(wait)
        count += 1