def UpdateWiki(self):
    """Write the contents of the teams dictionary back into the wiki."""
    wiki = MediaWiki(self.config.get('PlanetKubb', 'API'))
    wiki.login(self.config.get('KubbBot', 'Username'), self.config.get('KubbBot', 'Password'))

    # We need an edit token
    c = wiki.call({'action': 'query', 'titles': 'Foo', 'prop': 'info', 'intoken': 'edit'})
    print c
    my_token = c['query']['pages']['-1']['edittoken']
    print "Edit token: %s" % my_token

    print "== Updating wiki with new scores =="
    for team in self.teams:
        print "\"%s\",%f,%f" % (team, self.teams[team].mu, self.teams[team].sigma)
        c = wiki.call({
            'action': 'sfautoedit',
            'form': 'Team',
            'target': team,
            'Team[TrueSkill mu]': "%s" % self.teams[team].mu,
            'Team[TrueSkill sigma]': "%s" % self.teams[team].sigma,
            'token': my_token})
        print c
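# Note: the `intoken=edit` request above is the pre-MediaWiki-1.24 token API.
# A minimal sketch of the same token fetch against the newer `meta=tokens`
# endpoint (MediaWiki 1.27+); the URL and credentials are placeholders:
from simplemediawiki import MediaWiki

wiki = MediaWiki('https://example.org/w/api.php')
wiki.login('BotUser', 'BotPassword')
result = wiki.call({'action': 'query', 'meta': 'tokens', 'type': 'csrf'})
csrf_token = result['query']['tokens']['csrftoken']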
def open_connection(bot_name, env_name, api_url):
    """Open a connection to MediaWiki for a bot."""
    LOGGER.info("Opening MediaWiki connection for %s at %s", bot_name, api_url)
    apiary_wiki = MediaWiki(api_url)
    edit_token = None
    try:
        # Passwords may be defined in the environment or in the config file.
        # We prefer the environment variable if it is present.
        password = os.environ.get(env_name, None)
        if password is None:
            try:
                # The original snippet dropped this return value; assign it
                password = config.get('Passwords', bot_name)
            except Exception:
                LOGGER.warn('No configuration file detected.')

        if password is not None:
            LOGGER.info("Logging in as %s", bot_name)
            apiary_wiki.login(bot_name, password)

            LOGGER.info("Getting edit token for %s", bot_name)
            wiki_return = apiary_wiki.call({
                'action': 'tokens',
                'type': 'edit'
            })
            edit_token = wiki_return['tokens']['edittoken']
            LOGGER.info("%s has been given edit token %s", bot_name, edit_token)
        else:
            LOGGER.warn("No password was provided for %s. Queries allowed but editing will not work.", bot_name)
    except Exception as e:
        raise Exception("Unable to login as %s, got '%s'" % (bot_name, e))

    return (apiary_wiki, edit_token)
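# A hypothetical usage sketch for open_connection above; the bot name,
# environment-variable name, and API URL are placeholders:
import os

os.environ.setdefault('APIARY_BOT_PASSWORD', 'secret')  # normally set outside the process
wiki, token = open_connection('Bumble Bee', 'APIARY_BOT_PASSWORD',
                              'https://wikiapiary.com/w/api.php')
if token is None:
    print("Read-only connection: no password was available")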
def open_connection(bot_name, env_name):
    """Open a connection to MediaWiki for a bot."""
    LOGGER.info("Opening MediaWiki connection for %s at %s", bot_name, API_URL)
    apiary_wiki = MediaWiki(API_URL)
    try:
        # Passwords may be defined in the environment or in the config file.
        # We prefer the environment variable if it is present.
        password = os.environ.get(env_name, None)
        if password is None:
            try:
                # The original snippet dropped this return value; assign it
                password = config.get('Passwords', bot_name)
            except Exception:
                LOGGER.warn('No configuration file detected.')

        LOGGER.info("Logging in as %s", bot_name)
        apiary_wiki.login(bot_name, password)

        LOGGER.info("Getting edit token for %s", bot_name)
        wiki_return = apiary_wiki.call({
            'action': 'tokens',
            'type': 'edit'
        })
        edit_token = wiki_return['tokens']['edittoken']
        LOGGER.info("%s has been given edit token %s", bot_name, edit_token)
    except Exception as e:
        # The snippet was truncated here; this ending mirrors the fuller
        # variant of the same function above
        raise Exception("Unable to login as %s, got '%s'" % (bot_name, e))

    return (apiary_wiki, edit_token)
def get_lab_text(lab_slug, language):
    """Gets the text description in English or Italian from a single lab from makeinitaly.foundation."""
    if language.lower() in ("english", "en"):
        language = "en"
    elif language.lower() in ("italian", "it"):
        language = "it"
    else:
        language = "en"

    wiki = MediaWiki(makeinitaly__foundation_api_url)
    wiki_response = wiki.call({
        'action': 'query',
        'titles': lab_slug + "/" + language,
        'prop': 'revisions',
        'rvprop': 'content'})

    # If we don't know the pageid...
    for i in wiki_response["query"]["pages"]:
        if "revisions" in wiki_response["query"]["pages"][i]:
            content = wiki_response["query"]["pages"][i]["revisions"][0]["*"]
        else:
            content = ""

    # Clean the resulting string/list
    newstr01 = content.replace("}}", "")
    newstr02 = newstr01.replace("{{", "")
    result = newstr02.rstrip("\n|").split("\n|")

    return result[0]
def setup(config):
    try:
        wikiConn = MediaWiki("http://%s:8888/mediawiki/api.php" % mwserver,
                             user_agent="IA-mwbridge")
        wikiConn.login("david", "bad pass")
        token = wikiConn.call({"action": "query",
                               "meta": "tokens"})["query"]["tokens"]["csrftoken"]
    except Exception as e:
        # str(e) is required: concatenating the exception object itself raises TypeError
        print "Trouble connecting to mediawiki: " + str(e)
def handle(text, mic, profile):
    baseurl = "http://www.wikihow.com/"
    wiki = MediaWiki('http://www.wikihow.com/api.php')
    #wiki.login("*****@*****.**", "david1234")
    # Note: the original passed 'limit', which list=search ignores;
    # the correct parameter is 'srlimit'
    params = {'action': 'query', 'list': 'search', 'srsearch': text,
              'srprop': 'redirecttitle', 'srlimit': '1', 'format': 'json'}
    response = wiki.call(params)
    #r = json.dumps(response, sort_keys=True, indent=4, separators=(',', ': '))
    flag = 0
    flag_title = "none"
    pos = response['query']['search']
    query = getRequest(text)
    wiki.logout()

    # Getting the article with the best score
    for key in pos:
        val = fuzz.ratio(key['title'], query)
        print(str(val) + "% " + key['title'])
        if val > flag:
            flag = val
            flag_title = key['title']

    if flag != 0:
        answer = flag_title
        mic.say(answer)
        #rWH = renderWH.renderWikihow()
        #url = baseurl + answer
        #print url
        #url_ = rWH.getContent(str(url))
        #rWH.renderContent(url_)
        webbrowser.open(baseurl + flag_title)
    else:
        mic.say("I could not find anything bro!")
def connectwiki(self, bot_name):
    self.apiary_wiki = MediaWiki(self.config.get('WikiApiary', 'API'))
    c = self.apiary_wiki.login(self.config.get(bot_name, 'Username'),
                               self.config.get(bot_name, 'Password'))
    if self.args.verbose >= 1:
        print("Username: %s Password: %s" % (self.config.get(bot_name, 'Username'),
                                             self.config.get(bot_name, 'Password')))
        print(c)
def fetch_article_in_wikitext(articleTitle):
    # Fetches the article data using the simplemediawiki lib.
    import codecs
    wiki = MediaWiki('http://en.wikipedia.org/w/api.php')

    wikiTextPage = wiki.call({'action': 'parse', 'page': articleTitle, 'prop': 'wikitext'})
    wikiTextPage = wikiTextPage['parse']['wikitext']['*']
    codecs.open("FetchFunctionPulls/fetchWikiTextPage", 'w', encoding='utf-8').write(wikiTextPage)

    htmlArticle = wiki.call({'action': 'parse', 'page': articleTitle, 'prop': 'text'})
    htmlArticle = htmlArticle['parse']['text']['*']
    codecs.open("FetchFunctionPulls/fetchHTMLPage", 'w', encoding='utf-8').write(htmlArticle)
    #print type(htmlArticle)

    return wikiTextPage, htmlArticle
def __init__(self, url, username, password):
    self.wiki = MediaWiki(url)
    self.username = username
    self.password = password
    self.login = self._make_wiki_login_call({'action': 'login'})
    self.token = self._make_wiki_login_call({
        'action': 'login',
        'lgtoken': self.login['login']['token']
    })
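# The two calls above implement the classic two-step login: the first
# `action=login` returns a NeedToken result whose `token` is echoed back as
# `lgtoken`. simplemediawiki's own login() handles that round trip itself, so
# an equivalent sketch (placeholder URL and credentials) is simply:
wiki = MediaWiki('https://example.org/w/api.php')
logged_in = wiki.login('BotUser', 'BotPassword')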
def fetch(cls, url, use_cache=True):
    m = re.match(r'^http://([a-z\-]+)\.wikipedia\.org', url)
    page_lang = m.group(1).encode('utf8')
    page_title = extract_page_title(url, page_lang)
    wp = MediaWiki('http://%s.wikipedia.org/w/api.php' % page_lang)
    resp = wp.call({'action': 'query',
                    'prop': 'pageprops|revisions',
                    'titles': page_title.encode('utf8'),
                    'rvprop': 'content'})
    page = resp['query']['pages'].values()[0]
    content = page['revisions'][0].values()[0] if 'revisions' in page else None
    if 'pageprops' in page and 'wikibase_item' in page['pageprops']:
        wikidata_id = page['pageprops']['wikibase_item']
    else:
        wikidata_id = None
    return cls(page_title, content or '', page_lang, wikidata_id)
def parseWiki(con):
    useragent = build_user_agent('l2wiki', 0.1, 'https://github.com/tm-calculate/l2wiki')
    wiki = MediaWiki('http://l2central.info/c/api.php', user_agent=useragent)
    cmcontinue = parseWikiPart(con, wiki)
    while cmcontinue:
        cmcontinue = parseWikiPart(con, wiki, cmcontinue)
def __init__(self):
    config = ConfigParser.ConfigParser()
    config.read('../apiary.cfg')

    self.wikiapiary = MediaWiki(config.get('WikiApiary', 'api'))
    self.wikiapiary.login(config.get('TropicalBot', 'Username'),
                          config.get('TropicalBot', 'Password'))

    # We need an edit token on wiki2
    c = self.wikiapiary.call({
        'action': 'query',
        'titles': 'Foo',
        'prop': 'info',
        'intoken': 'edit'
    })
    self.my_token = c['query']['pages']['-1']['edittoken']
def fetch(cls, url, use_cache=True):
    m = re.match(r'^http://([a-z]{2})\.wikipedia\.org', url)
    page_lang = m.group(1).encode('utf8')
    page_title = extract_page_title(url, page_lang)
    wp = MediaWiki('http://%s.wikipedia.org/w/api.php' % page_lang)
    return cls(
        page_title,
        get_page_content(wp, page_title, page_lang, use_cache) or '',
        page_lang)
def fetch(cls, url, use_cache=True):
    m = re.match(r'^https?://([a-z\-]+)\.wikipedia\.org/wiki/(.*)$', url)
    page_lang = m.group(1).encode('utf8')
    page_title = urllib.unquote(m.group(2).encode('utf8')).decode('utf8')
    wp = MediaWiki('https://%s.wikipedia.org/w/api.php' % page_lang)
    resp = wp.call({
        'action': 'query',
        'prop': 'pageprops|revisions',
        'titles': page_title.encode('utf8'),
        'rvprop': 'content'
    })
    page = resp['query']['pages'].values()[0]
    content = page['revisions'][0].values()[0] if 'revisions' in page else None
    if 'pageprops' in page and 'wikibase_item' in page['pageprops']:
        wikidata_id = page['pageprops']['wikibase_item']
    else:
        wikidata_id = None
    return cls(page_title, content or '', page_lang, wikidata_id)
def __init__(self):
    config = ConfigParser.ConfigParser()
    config.read('../apiary.cfg')

    # Connect to SMW Community Wiki
    self.smwreferata = MediaWiki('http://smw.referata.com/w/api.php')

    # Connect to WikiApiary
    self.wikiapiary = MediaWiki(config.get('WikiApiary', 'api'))
    self.wikiapiary.login(config.get('wikkiibot', 'Username'),
                          config.get('wikkiibot', 'Password'))

    # We need an edit token
    c = self.wikiapiary.call({
        'action': 'query',
        'titles': 'Foo',
        'prop': 'info',
        'intoken': 'edit'
    })
    self.my_token = c['query']['pages']['-1']['edittoken']
def get_segment(segment_id):
    '''Get a specific segment from wikiapiary'''
    LOGGER.info("Connecting to %s", APIARY_URL)
    apiary_wiki = MediaWiki(APIARY_URL)

    print("Retrieving segment", segment_id)
    my_query = ''.join([
        '[[Category:Website]]',
        '[[Is defunct::False]]',
        '[[Is active::True]]',
        "[[Has bot segment::%d]]" % segment_id,
        '|?Has API URL',
        '|?Has statistics URL',
        '|?Check every',
        '|?Creation date',
        '|?Page ID',
        '|?Collect general data',
        '|?Collect extension data',
        '|?Collect skin data',
        '|?Collect statistics',
        '|?Collect semantic statistics',
        '|?Collect logs',
        '|?Collect recent changes',
        '|?Collect statistics stats',
        '|sort=Creation date',
        '|order=asc',
        '|limit=1000'])

    sites = apiary_wiki.call({'action': 'ask', 'query': my_query})

    if len(sites['query']['results']) > 0:
        for pagename, site in sites['query']['results'].items():
            print("Processing", pagename)
    else:
        LOGGER.error("No records returned.")
def get_labs(data_format):
    """Gets data from all labs from hackerspaces.org."""
    labs = []

    # Get the first page of data
    wiki = MediaWiki(hackerspaces_org_api_url)
    wiki_response = wiki.call({
        'action': 'query',
        'list': 'categorymembers',
        'cmtitle': 'Category:Hackerspace',
        'cmlimit': '500'
    })
    # Guard the continuation token: a single page of results has no "query-continue"
    nextpage = ""
    if "query-continue" in wiki_response:
        nextpage = wiki_response["query-continue"]["categorymembers"]["cmcontinue"]

    urls = []
    for i in wiki_response["query"]["categorymembers"]:
        urls.append(i["title"].replace(" ", "_"))

    # Load all the Labs in the first page
    for i in urls:
        current_lab = get_single_lab(i, data_format)
        labs.append(current_lab)

    # Load all the Labs from the other pages
    while "query-continue" in wiki_response:
        wiki_response = wiki.call({
            'action': 'query',
            'list': 'categorymembers',
            'cmtitle': 'Category:Hackerspace',
            'cmlimit': '500',
            'cmcontinue': nextpage
        })
        urls = []
        for i in wiki_response["query"]["categorymembers"]:
            urls.append(i["title"].replace(" ", "_"))

        # Load all the Labs
        for i in urls:
            current_lab = get_single_lab(i, data_format)
            labs.append(current_lab)

        if "query-continue" in wiki_response:
            nextpage = wiki_response["query-continue"]["categorymembers"]["cmcontinue"]
        else:
            break

    # Transform the list into a dictionary
    labs_dict = {}
    for j, k in enumerate(labs):
        labs_dict[j] = k

    return labs_dict
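# The paging in get_labs above follows the pre-MediaWiki-1.21 "query-continue"
# protocol. A sketch of the same loop factored into a generator, assuming
# simplemediawiki's MediaWiki is imported as in the snippet above; the API URL
# and category name are whatever the caller supplies:
def iter_category_members(api_url, category):
    wiki = MediaWiki(api_url)
    request = {'action': 'query', 'list': 'categorymembers',
               'cmtitle': category, 'cmlimit': '500'}
    while True:
        response = wiki.call(request)
        for member in response['query']['categorymembers']:
            yield member['title']
        if 'query-continue' not in response:
            break
        request['cmcontinue'] = response['query-continue']['categorymembers']['cmcontinue']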
def finger(request):
    change_list = []
    foundtext = False

    # PUT IN CONNECTION VALUES HERE
    wiki = MediaWiki('set_your_api_url')
    wiki.login('set_username', 'set_password')
    tokendoc = wiki.call({'action': 'tokens'})
    edittoken = tokendoc.get("tokens").get('edittoken')

    searchtext = request.GET['text'].strip()
    searchpage = request.GET['page'].strip()
    if searchtext == '' or searchpage == '':
        context = {
            'message': 'Missing either search text or page to search!',
        }
        return render(request, 'blame/error.html', context)

    queryresult = wiki.call({'action': 'query',
                             'prop': 'revisions',
                             'rvprop': 'ids|user',
                             'rvdir': 'newer',
                             'rvlimit': '5000',
                             'format': 'jsonfm',
                             'titles': searchpage})
    #print(str(queryresult))
    if '-1' in list(queryresult['query']['pages'].keys()):
        context = {
            'message': 'The page you requested was not found! Please check your capitalization, namespace, and spelling!',
        }
        return render(request, 'blame/error.html', context)

    revisions = list(queryresult['query']['pages'][list(queryresult['query']['pages'].keys())[0]]['revisions'])
    for revision in revisions:
        revisiondata = wiki.call({'action': 'query',
                                  'prop': 'revisions',
                                  'revids': revision['revid'],
                                  'rvprop': 'content',
                                  'format': 'jsonfm'})
        revisiontext = revisiondata['query']['pages'][list(queryresult['query']['pages'].keys())[0]]['revisions'][0]['*']
        if not foundtext and searchtext in revisiontext:
            # PUT IN URL VALUE HERE
            change_list.append({'changetype': 'Added',
                                'revision': revision['revid'],
                                'user': revision['user'],
                                'link': 'set_your_website_url?title=' + searchpage + '&oldid=' + str(revision['revid'])})
            foundtext = True
        elif foundtext and searchtext not in revisiontext:
            # PUT IN URL VALUE HERE
            change_list.append({'changetype': 'Removed',
                                'revision': revision['revid'],
                                'user': revision['user'],
                                'link': 'set_your_website_url?title=' + searchpage + '&oldid=' + str(revision['revid'])})
            foundtext = False

    context = {
        'change_list': change_list,
    }
    return render(request, 'blame/finger.html', context)
def __init__(self):
    self.client = MediaWiki('https://commons.wikimedia.org/w/api.php')
class Wiki(models.Model):
    MAIN_NAMESPACE = 0
    IMAGE_NAMESPACE = 6
    NAMESPACE_CHOICES = (
        (MAIN_NAMESPACE, 'Articles'),
        (IMAGE_NAMESPACE, 'Images'),
    )
    NAMESPACE_TYPES = {
        MAIN_NAMESPACE: 'wikimediaarticle',
        IMAGE_NAMESPACE: 'linksgallery'
    }

    name = models.CharField(max_length=200, help_text=_(u'Name under which this wiki is known'))
    shortname = models.SlugField(max_length=50, help_text=_(u'Wiki identifier (ASCII characters and digits only)'))
    url = models.URLField(help_text=_(u'Address of the directory where api.php is found'))
    language = models.ForeignKey(ResourceLanguage)
    # the namespace we are interested in in this wiki
    namespace = models.IntegerField(choices=NAMESPACE_CHOICES, default=MAIN_NAMESPACE,
                                    help_text=_(u'Type of media present on this wiki'))
    # Resource slug
    slug = models.CharField(max_length=50, editable=False)

    def __init__(self, *args, **kwargs):
        super(Wiki, self).__init__(*args, **kwargs)
        self.wiki = MediaWiki(self.url + 'api.php')

    def save(self, *args, **kwargs):
        if not self.slug:
            self.slug = self.shortname + '.' + self.language
        if not self.shortname:
            self.shortname = self.name
        super(Wiki, self).save(*args, **kwargs)

    def wiki_links_replacer(self, match):
        if match.group(1) == 'wiki':
            return 'href="' + reverse('catalog:wikimediaarticle-view',
                                      kwargs={'slug': self.slug}) + match.group(2) + '"'
        elif match.group(1) == 'w':
            return 'href="' + self.url + match.group(2) + '" target="_blank"'
        else:
            return match.group()

    def __unicode__(self):  # Python 3: def __str__(self):
        return self.name

    # find out the actual page title after all redirections
    def get_redirect_title(self, title):
        # first see if there is a redirect
        test = self.wiki.call({'action': 'query', 'redirects': 'true', 'titles': title})
        if not 'query' in test:
            raise Http404
        if 'redirects' in test['query']:
            # a redirect was encountered
            return test['query']['redirects'][-1]['to']
        elif not 'pages' in test['query']:
            # no page was found
            return ''
        else:
            return title

    def slugify(self, title):
        return self.slug + '/' + wiki_slugify(title)

    @staticmethod
    def make_from_slug(slug):
        wiki_slug, slug = slug.split('/', 1)
        title = wiki_deslugify(slug)
        wiki = get_object_or_404(Wiki, slug=wiki_slug)
        title, snippet = wiki.get_snippet(title)
        return wiki, title, snippet

    # Retrieve the whole content of a single page
    def get_page(self, title):
        # first see if the page exists
        test = self.wiki.call({'action': 'query', 'redirects': 'true', 'titles': title})
        if not 'pages' in test['query']:
            # no page was found
            raise Http404
        else:
            # yes, there is! return special string to indicate it
            data = self.wiki.call({'action': 'parse', 'page': title,
                                   'prop': 'text', 'disablepp': 'true'})  # , 'redirects':'true'
            html = data['parse']['text']['*']
            # we need to replace internal links
            html = p.sub(self.wiki_links_replacer, html)
            return html

    # retrieve a snippet for a single page
    def get_snippet(self, title):
        try:
            data = self.wiki.call({'action': 'query', 'list': 'search',
                                   'srsearch': title, 'srprop': 'snippet',
                                   'srnamespace': "%d" % self.namespace,
                                   'srlimit': '1'})
        except URLError:
            raise Http404
        data = data['query']['search']
        if not data:
            raise Http404
        else:
            return (data[0]['title'], data[0]['snippet'])

    @staticmethod
    def search_all_wikis(querystring, queryloc, language):
        # search on available wikis in the requested language
        wqs = Wiki.objects.filter(Q(namespace=Wiki.IMAGE_NAMESPACE) |
                                  Q(namespace=Wiki.MAIN_NAMESPACE, language__code=language))
        return chain.from_iterable([w.search(querystring) for w in wqs])

    @staticmethod
    def list_all_wikis(language):
        for wiki in Wiki.objects.filter(language__code=language).values('shortname'):
            yield wiki['shortname']

    def search(self, querystring):
        try:
            data = self.wiki.call({'action': 'query', 'list': 'search',
                                   'srsearch': querystring, 'srprop': 'snippet',
                                   'srnamespace': "%d" % self.namespace})
        except URLError:
            raise Http404
        data = data['query']['search']
        # if we are searching images we need to retrieve the thumbnail URLs
        if self.namespace is Wiki.IMAGE_NAMESPACE:
            titles = [d['title'] for d in data]
            pages = self.wiki.call({'action': 'query',
                                    'titles': string.join(titles, '|'),
                                    'prop': 'imageinfo',
                                    'iiprop': 'url',
                                    'iiurlwidth': '300'})
            urls = [page['imageinfo'][0]['thumburl']
                    for page in pages['query']['pages'].values()]
            for idx, d in enumerate(data):
                description = "<img src='" + urls[idx] + "'/>"
                dummy, name = string.split(d['title'], ':')
                yield {'resource_type': Wiki.NAMESPACE_TYPES[Wiki.IMAGE_NAMESPACE],
                       'resource_source': 'external',
                       'name': name,
                       'slug': self.slugify(name),
                       'description': description}
        else:
            for d in data:
                yield {'resource_type': Wiki.NAMESPACE_TYPES[Wiki.MAIN_NAMESPACE],
                       'resource_source': 'external',
                       'resource_name': d['title'],
                       'resource_url': reverse('catalog:wikimediaarticle-view',
                                               kwargs={'slug': self.slugify(d['title'])}),
                       'resource_description': d['snippet'],
                       'resource_tooltip': d['snippet'] + '<br><e>' + _(u'Source') + ': ' + self.shortname + '</e>'}

    # retrieve the url of a given image, if it exists on the wiki, otherwise raise 404 error
    def get_image_info(self, name):
        pages = self.wiki.call({'action': 'query',
                                'titles': 'File:' + name,
                                'prop': 'imageinfo',
                                'iiprop': 'url',
                                'iiurlwidth': '300'})
        if 'pages' in pages['query']:
            inf = pages['query']['pages'].itervalues().next()['imageinfo'][0]
            return inf['url'], inf['thumburl']
        else:
            raise Http404
def __init__(self, *args, **kwargs):
    super(Wiki, self).__init__(*args, **kwargs)
    self.wiki = MediaWiki(self.url + 'api.php')
import re
import sqlalchemy
import solr
from simplemediawiki import MediaWiki
from editing import MusicBrainzClient
import pprint
import urllib
import time
from utils import mangle_name, join_names, contains_text_in_script, quote_page_title
import config as cfg

engine = sqlalchemy.create_engine(cfg.MB_DB)
db = engine.connect()
db.execute("SET search_path TO musicbrainz, %s" % cfg.BOT_SCHEMA_DB)

wp = MediaWiki('http://ko.wikipedia.org/w/api.php')
wps = solr.SolrConnection('http://localhost:8983/solr/wikipedia_ko')

mb = MusicBrainzClient(cfg.MB_USERNAME, cfg.MB_PASSWORD, cfg.MB_SITE)

"""
CREATE TABLE bot_wp_artist_ko (
    gid uuid NOT NULL,
    processed timestamp with time zone DEFAULT now()
);

ALTER TABLE ONLY bot_wp_artist_ko
    ADD CONSTRAINT bot_wp_artist_kokey PRIMARY KEY (gid);
"""

query = """
class Wiki(object):
    def __init__(self, url, username, password):
        self.wiki = MediaWiki(url)
        self.username = username
        self.password = password
        self.login = self._make_wiki_login_call({'action': 'login'})
        self.token = self._make_wiki_login_call(
            {'action': 'login', 'lgtoken': self.login['login']['token']})

    def _make_wiki_login_call(self, packet):
        packet.update({'lgname': self.username,
                       'lgpassword': self.password})
        response = self.wiki.call(packet)
        if DEBUG:
            print response
        return response

    def all_pages(self):
        response = self.wiki.call({'action': 'query', 'list': 'allpages'})
        if DEBUG:
            print response
        assert_present('query', response)

        marker = 'foo'
        marker_name = 'foo'
        while marker:
            if 'query-continue' in response:
                for possible in ['apfrom', 'apcontinue']:
                    if possible in response['query-continue']['allpages']:
                        marker = response['query-continue']['allpages'][possible]
                        marker_name = possible
                        break
            else:
                marker = None

            for page in response['query']['allpages']:
                yield page['title']

            response = self.wiki.call({'action': 'query',
                                       'list': 'allpages',
                                       marker_name: marker})
            if DEBUG:
                print response

    def get_page(self, title):
        response = self.wiki.call({'action': 'query',
                                   'titles': title,
                                   'prop': 'revisions',
                                   'rvprop': 'content'})
        if DEBUG:
            print response
        assert_present('query', response)

        pages = response['query']['pages']
        page_id = pages.keys()[0]
        if not 'revisions' in pages[page_id]:
            # This is a new page
            return ''
        return pages[page_id]['revisions'][0]['*']

    def check_for_page(self, title):
        response = self.wiki.call({'action': 'query',
                                   'titles': title,
                                   'prop': 'revisions',
                                   'rvprop': 'content'})
        if DEBUG:
            print response
        assert_present('query', response)

        pages = response['query']['pages']
        page_id = pages.keys()[0]
        return 'revisions' in pages[page_id]

    def post_page(self, title, text, minor=True, bot=True):
        response = self.wiki.call({'action': 'query',
                                   'prop': 'info',
                                   'titles': title,
                                   'intoken': 'edit'})
        if DEBUG:
            print response
        assert_present('query', response)

        pages = response['query']['pages']
        page_id = pages.keys()[0]
        response = self.wiki.call({'action': 'edit',
                                   'minor': minor,
                                   'bot': bot,
                                   'title': title,
                                   'text': json.dumps(text).replace('\\n', '\n')[1:-1],
                                   'token': pages[page_id]['edittoken']})
        if DEBUG:
            print response
        if not 'nochange' in response['edit']:
            print 'Modified %s' % title

    def create_account(self, username, password, email, realname):
        # The first call returns a token which must be passed back in a second call
        response = self.wiki.call({'action': 'createaccount',
                                   'name': username,
                                   'password': password,
                                   'email': email,
                                   'realname': realname})
        if DEBUG:
            print response
        response = self.wiki.call({'action': 'createaccount',
                                   'name': username,
                                   'password': password,
                                   'email': email,
                                   'realname': realname,
                                   'token': response['createaccount']['token']})
        if DEBUG:
            print response
        return 'error' not in response
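# A hypothetical driver for the Wiki class above, assuming DEBUG and
# assert_present come from the same module; the URL and credentials are
# placeholders:
wiki = Wiki('https://example.org/w/api.php', 'BotUser', 'BotPassword')
for title in wiki.all_pages():
    print title
if not wiki.check_for_page('Sandbox'):
    wiki.post_page('Sandbox', 'A new sandbox page')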
# These imports are used below but were missing from the snippet
import sys
import sqlalchemy
import solr
from simplemediawiki import MediaWiki

from editing import MusicBrainzClient
import pprint
import urllib
import time
from mbbot.wp.wikipage import WikiPage
from mbbot.wp.analysis import determine_country
from utils import mangle_name, join_names, out, colored_out, bcolors, escape_query, quote_page_title, wp_is_canonical_page
import config as cfg

engine = sqlalchemy.create_engine(cfg.MB_DB)
db = engine.connect()
db.execute("SET search_path TO musicbrainz, %s" % cfg.BOT_SCHEMA_DB)

wp_lang = sys.argv[1] if len(sys.argv) > 1 else 'en'

wp = MediaWiki('http://%s.wikipedia.org/w/api.php' % wp_lang)

suffix = '_' + wp_lang if wp_lang != 'en' else ''
wps = solr.SolrConnection('http://localhost:8983/solr/wikipedia' + suffix)

mb = MusicBrainzClient(cfg.MB_USERNAME, cfg.MB_PASSWORD, cfg.MB_SITE)

"""
CREATE TABLE bot_wp_artist_link (
    gid uuid NOT NULL,
    lang character varying(2),
    processed timestamp with time zone DEFAULT now(),
    CONSTRAINT bot_wp_artist_link_pkey PRIMARY KEY (gid, lang)
);

CREATE TABLE bot_wp_artist_link_ignore (
    gid uuid NOT NULL,
class wmbot:
    # Array to append sites to
    sites = []

    # This file is a list of all the database names used by Wikimedia;
    # we can use this to try and derive the names of various wikis
    source_list = 'http://noc.wikimedia.org/conf/all.dblist'

    # Blank reference to store mediawiki object in
    wikiapiary = {}

    # Edit token
    my_token = ""

    # Counter
    create_counter = 0

    # Regex pattern
    regex_pattern = r'^(\w+)(wiki|wikibooks|wikiquote|wiktionary|wikinews|wikisource|wikiversity|wikimedia|wikivoyage)$'

    # Site data
    siteData = {
        'wiki': {
            'domain': 'wikipedia.org',
            'name': 'Wikipedia (%s)',
            'farm': 'Wikipedia',
            'logo': 'Wikipedia-logo.png'
        },
        'wikibooks': {
            'domain': 'wikibooks.org',
            'name': 'Wikibooks (%s)',
            'farm': 'Wikibooks',
            'logo': 'Wikibooks Logo.png'
        },
        'wiktionary': {
            'domain': 'wiktionary.org',
            'name': 'Wiktionary (%s)',
            'farm': 'Wiktionary',
            'logo': '170px-Wiktportal.svg.png'
        },
        'wikiquote': {
            'domain': 'wikiquote.org',
            'name': 'Wikiquote (%s)',
            'farm': 'Wikiquote',
            'logo': 'Wikiquote Logo.png'
        },
        'wikinews': {
            'domain': 'wikinews.org',
            'name': 'Wikinews (%s)',
            'farm': 'Wikinews',
            'logo': '240px-Wikinews-logo.png'
        },
        'wikisource': {
            'domain': 'wikisource.org',
            'name': 'Wikisource (%s)',
            'farm': 'Wikisource',
            'logo': 'Wikisource Logo.png'
        },
        'wikiversity': {
            'domain': 'wikiversity.org',
            'name': 'Wikiversity (%s)',
            'farm': 'Wikiversity',
            'logo': 'Wikiversity Logo.png'
        },
        'wikivoyage': {
            'domain': 'wikivoyage.org',
            'name': 'Wikivoyage (%s)',
            'farm': 'Wikivoyage',
            'logo': 'WikivoyageOldLogoSmall.png'
        },
        'wikimedia': {
            'domain': 'wikimedia.org',
            'name': 'Wikimedia (%s)',
            'farm': 'Wikimedia',
            'logo': 'Wikimediafoundation-logo.png'
        }
    }

    def __init__(self):
        config = ConfigParser.ConfigParser()
        config.read('../apiary.cfg')

        self.wikiapiary = MediaWiki(config.get('WikiApiary', 'api'))
        self.wikiapiary.login(config.get('wmbot', 'Username'),
                              config.get('wmbot', 'Password'))

        # We need an edit token on wiki2
        c = self.wikiapiary.call({
            'action': 'query',
            'titles': 'Foo',
            'prop': 'info',
            'intoken': 'edit'
        })
        self.my_token = c['query']['pages']['-1']['edittoken']

    def getList(self):
        self.sites = requests.get(self.source_list).text.split('\n')

    def validateApi(self, api_url):
        # Call api.php?action=query&meta=siteinfo&siprop=general&format=json
        my_url = api_url + '?action=query&meta=siteinfo&siprop=general&format=json'
        try:
            result = requests.get(my_url).json()
            if 'generator' in result['query']['general']:
                print "Detected %s" % result['query']['general']['generator']
                return True
            else:
                return False
        except:
            print "ERROR: Failed call to API check."
            return False

    def createSite(self, lang, token):
        siteTemplate = """{{Website
|Name=%s
|URL=%s
|API URL=%s
|Image=%s
|Farm=%s
|Collect general data=Yes
|Collect extension data=Yes
|Collect skin data=Yes
|Check every=240
|Collect statistics=Yes
|Audited=No
|Curated=No
|Active=Yes
}}
"""

        my_name = self.siteData[token]['name'] % lang
        my_template = siteTemplate % (
            my_name,
            "http://%s.%s/" % (lang, self.siteData[token]['domain']),
            "http://%s.%s/w/api.php" % (lang, self.siteData[token]['domain']),
            self.siteData[token]['logo'],
            self.siteData[token]['farm'])
        print my_template

        self.wikiapiary.call({
            'action': 'edit',
            'title': my_name,
            'text': my_template,
            'token': self.my_token,
            'bot': 'true'
        })
        self.create_counter += 1

    def checkSite(self, lang, site_domain):
        # Build the API URL using Wikimedia's known convention
        api_url = "http://%s.%s/w/api.php" % (lang, site_domain)
        print "Testing %s" % api_url

        # First see if this is a valid API URL before we query WikiApiary
        isValid = self.validateApi(api_url)
        if isValid:
            # Construct an Ask query for WikiApiary
            my_query = "[[Has API URL::%s]]" % api_url
            # Execute the query against WikiApiary
            c = self.wikiapiary.call({'action': 'ask', 'query': my_query})
            # Return the count of results for the query
            return True, int(c['query']['meta']['count'])
        else:
            return False, 0

    def processSite(self, token):
        # Guard against tokens that do not match at all; the original
        # indexed match[0] unconditionally, which raises IndexError
        match = re.findall(self.regex_pattern, token)
        if len(match) == 1 and len(match[0]) == 2:
            return match[0]
        else:
            return (False, False)

    def main(self):
        # Get the list of tokens from the source list
        self.getList()

        # Now loop through the tokens
        for token in self.sites:
            print "\nProcessing %s" % token

            # First turn a token into a lang and site
            (lang, site) = self.processSite(token)

            # If we successfully got lang and site, proceed
            if lang is not False and site is not False:
                lang = lang.replace('_', '-')
                # Use a guess of the API domain to see if the wiki exists
                (valid, siteCount) = self.checkSite(lang, self.siteData[site]['domain'])
                if valid:
                    if siteCount == 0:
                        print "%s appears to be an untracked token." % token
                        # Now add it to WikiApiary
                        self.createSite(lang, site)
                    elif siteCount == 1:
                        print "%s already exists." % token
                    elif siteCount > 1:
                        print "%s found %d websites, which should never happen." % (token, siteCount)
                else:
                    print "%s did not resolve to a valid API URL." % token
            else:
                print "%s could not process token." % token
def get_single_lab(lab_slug, data_format):
    """Gets data from a single lab from hackerspaces.org."""
    wiki = MediaWiki(hackerspaces_org_api_url)
    wiki_response = wiki.call({
        'action': 'query',
        'titles': lab_slug,
        'prop': 'revisions',
        'rvprop': 'content'
    })

    # If we don't know the pageid...
    for i in wiki_response["query"]["pages"]:
        content = wiki_response["query"]["pages"][i]["revisions"][0]["*"]

    # Transform the data into a Lab object
    current_lab = Lab()
    equipment_list = []

    # Template parameters that map directly onto Lab attributes.
    # Note: the original code assigned the "founding" parameter to
    # current_lab.city, which looks like a copy-paste slip; it is mapped
    # to a "founding" attribute here.
    direct_fields = {
        "logo": "logo", "country": "country", "state": "state",
        "city": "city", "founding": "founding", "membercount": "membercount",
        "fee": "fee", "size": "size", "status": "status", "site": "site",
        "wiki": "wiki", "irc": "irc", "jabber": "jabber", "phone": "phone",
        "youtube": "youtube", "eventbrite": "eventbrite",
        "facebook": "facebook", "ustream": "ustream", "flickr": "flickr",
        "twitter": "twitter", "googleplus": "googleplus", "email": "email",
        "maillist": "maillist", "ical": "ical", "forum": "forum",
        "street-address": "street_address", "postalcode": "postalcode",
        "region": "region", "post-office-box": "post_office_box"
    }

    # Parse the Mediawiki code
    wikicode = mwparserfromhell.parse(content)
    for k in wikicode.filter_templates():
        element_name = unicode(k.name)
        if "Hackerspace" in element_name:
            for j in k.params:
                param_name = unicode(j.name)
                if param_name in direct_fields:
                    setattr(current_lab, direct_fields[param_name], unicode(j.value))
                elif param_name == "coordinate":
                    value = unicode(j.value)
                    current_lab.coordinates = value
                    if ", " in value:
                        latlong = value.rstrip(", ").split(", ")
                    elif " , " in value:
                        latlong = value.rstrip(" , ").split(" , ")
                    else:
                        latlong = ["", ""]
                    current_lab.lat = latlong[0]
                    current_lab.long = latlong[1]
        elif "Equipment" in element_name:
            for j in k.params:
                equipment_list.append(unicode(j).replace("equipment=", ""))
    current_lab.equipment = equipment_list

    # Load the free text
    freetext = ""
    for k in wikicode._nodes:
        try:
            test_value = k.name
        except AttributeError:
            freetext += unicode(k)
    current_lab.text = freetext

    if data_format == "dict":
        return current_lab.__dict__
    elif data_format == "object":
        return current_lab
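# A hypothetical usage sketch for get_single_lab above; the slug is a placeholder:
lab = get_single_lab('Example_Hackerspace', 'dict')
print lab.get('city')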
#! /usr/bin/env python

# Imports
from simplemediawiki import MediaWiki, build_user_agent
import getpass
import sys
from timestamper import *
import datetime
import time
import pprint

# Get wiki location
location = raw_input("Base URL to the wiki API (YOUR_WIKI_ROOT/api.php): ")
if (location[0:7].lower() != "http://"):
    location = "http://" + location

wiki = MediaWiki(location)
if wiki.normalize_api_url() is None:
    sys.exit("Invalid Wiki URL")

# Get login credentials. The snippet was redacted here, so this loop is a
# reconstruction of the obvious intent: prompt until the login succeeds.
ua = build_user_agent("uturn", "0.1", "https://github.com/tomasreimers/wiki-uturn")
while True:
    username = raw_input("Username: ")
    password = getpass.getpass("Password: ")
    if wiki.login(username, password):
        break
    print "Invalid login"

# Get date to revert to
print "When would you like to revert to (IN UTC)?"
year = int(raw_input("Year: "))
class ApiaryBot:

    args = []
    config = []
    apiary_wiki = []
    apiary_db = []
    stats = {}
    edit_token = ''

    def __init__(self):
        # Get command line options
        self.get_args()
        # Get configuration settings
        self.get_config(self.args.config)
        # Connect to the database
        self.connectdb()
        # Initialize stats
        self.stats['statistics'] = 0
        self.stats['smwinfo'] = 0
        self.stats['smwusage'] = 0
        self.stats['general'] = 0
        self.stats['extensions'] = 0
        self.stats['skins'] = 0
        self.stats['skippedstatistics'] = 0
        self.stats['skippedgeneral'] = 0
        self.stats['whois'] = 0
        self.stats['maxmind'] = 0
        self.stats['interwikimap'] = 0
        self.stats['libraries'] = 0
        self.stats['namespaces'] = 0

    def get_config(self, config_file='../apiary.cfg'):
        try:
            self.config = ConfigParser.ConfigParser()
            self.config.read(config_file)
        except IOError:
            print("Cannot open %s." % config_file)

    def get_args(self):
        parser = argparse.ArgumentParser(
            prog="Bumble Bee",
            description="retrieves usage and statistic information for WikiApiary")
        parser.add_argument("-s", "--segment",
                            help="only work on websites in defined segment")
        parser.add_argument("--site",
                            help="only work on this specific site id")
        parser.add_argument("-f", "--force", action="store_true",
                            help="run regardless of when the last time data was updated")
        parser.add_argument("-d", "--debug", action="store_true",
                            help="do not write any changes to wiki or database")
        parser.add_argument("--config", default="../apiary.cfg",
                            help="use an alternative config file")
        parser.add_argument("-v", "--verbose", action="count", default=0,
                            help="increase output verbosity")
        parser.add_argument("--version", action="version", version="%(prog)s 0.1")

        # All set, now get the arguments
        self.args = parser.parse_args()

    def filter_illegal_chars(self, pre_filter):
        # Utility function to make sure that strings are okay for page titles
        return re.sub(r'[#<>\[\]\|{}]', '', pre_filter).replace('=', '-')

    def sqlutcnow(self):
        now = datetime.datetime.utcnow()
        now = now.replace(tzinfo=pytz.utc)
        now = now.replace(microsecond=0)
        return now.strftime('%Y-%m-%d %H:%M:%S')

    def make_request(self, site, data_url, bot='Bumble Bee'):
        req = urllib2.Request(data_url)
        req.add_header('User-Agent', self.config.get(bot, 'User-Agent'))
        req.add_header('Accept-Encoding', 'gzip')
        opener = urllib2.build_opener()

        try:
            t1 = datetime.datetime.now()
            f = opener.open(req)
            duration = (datetime.datetime.now() - t1).total_seconds()
        except ssl.SSLError as e:
            msg = "SSL Error: " + str(e)
            self.record_error(site=site, log_message=msg, log_type='info',
                              log_severity='normal', log_bot=bot, log_url=data_url)
            return None, None
        except urllib2.HTTPError as e:
            # HTTPError must be caught before URLError (it is a subclass);
            # the original order made this branch unreachable
            if e.code > 399 and e.code < 500:
                raise FourHundred(e)
            if e.code > 499 and e.code < 600:
                raise FiveHundred(e)
            self.record_error(site=site, log_message="%s" % e,
                              log_type='error', log_severity='normal',
                              log_bot=bot, log_url=data_url)
            return None, None
        except urllib2.URLError as e:
            self.record_error(site=site, log_message="URLError: %s" % e,
                              log_type='error', log_severity='normal',
                              log_bot=bot, log_url=data_url)
            return None, None
        except Exception as e:
            self.record_error(site=site, log_message=str(e), log_type='info',
                              log_severity='normal', log_bot=bot, log_url=data_url)
            return None, None
        else:
            return f, duration

    def pull_json(self, site, data_url, bot='Bumble Bee'):
        socket.setdefaulttimeout(10)

        (f, duration) = self.make_request(site, data_url, bot)
        if f is None:
            return False, None, None

        # Clean the returned string before we parse it; sometimes there are
        # junky error messages from PHP in here, or simply a newline that
        # shouldn't be present. The regex here is really simple, but it
        # seems to work fine.
        if f.info().get('Content-Encoding') == 'gzip':
            buf = StringIO(f.read())
            gz = gzip.GzipFile(fileobj=buf)
            ret_string = gz.read()
        else:
            ret_string = f.read()
        json_match = re.search(r"({.*})", ret_string, flags=re.MULTILINE)
        if json_match is None or json_match.group(1) is None:
            raise NoJSON(data_url + "||" + ret_string)

        # Found a JSON block
        try:
            data = simplejson.loads(json_match.group(1))
        except ValueError:
            raise NoJSON(data_url + "||" + ret_string)

        return True, data, duration

    def runSql(self, sql_command, args=None):
        if self.args.verbose >= 3:
            print("SQL: %s" % sql_command)
        try:
            cur = self.apiary_db.cursor()
            cur.execute('SET NAMES utf8mb4')
            cur.execute("SET CHARACTER SET utf8mb4")
            cur.execute("SET character_set_connection=utf8mb4")
            cur.execute(sql_command, args)
            cur.close()
            self.apiary_db.commit()
            return True, cur.rowcount
        except Exception as e:
            print("Exception generated while running SQL command.")
            print("Command: %s" % sql_command)
            print("Exception: %s" % e)
            return False, 0

    def record_error(self, site=None, log_message='Unknown Error',
                     log_type='info', log_severity='normal',
                     log_bot=None, log_url=None):

        # Normalize site before it is used; the original printed
        # site['pagename'] before the None guard, which crashes
        if site is None:
            site = {'Has ID': 0}
        if 'Has name' in site:
            site['pagename'] = site['Has name']
        elif 'pagename' not in site:
            site['pagename'] = 'Error'

        if self.args.verbose >= 2:
            print("New log message for %s" % site['pagename'])
        if self.args.verbose >= 1:
            print(log_message)

        if log_bot is None:
            log_bot = "null"
        else:
            log_bot = "'%s'" % log_bot

        if log_url is None:
            log_url = "null"
        else:
            log_url = "'%s'" % log_url

        temp_sql = "INSERT apiary_website_logs "
        temp_sql += "(website_id, log_date, website_name, log_type, "
        temp_sql += "log_severity, log_message, log_bot, log_url) "

        if len(log_message) > 65535:
            print("log_message too long: %s" % log_message)
            log_message = log_message[0:65535]

        # The format string is not really a normal Python format string.
        # You must always use %s http://stackoverflow.com/a/5785163
        temp_sql += "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"
        args = (site['Has ID'], self.sqlutcnow(), site['pagename'], log_type,
                log_severity, log_message, log_bot, log_url)

        self.runSql(temp_sql, args)

    def clear_error(self, sitename):
        # This function clears the error status of a website
        socket.setdefaulttimeout(30)

        if self.args.verbose >= 2:
            print("Clearing error for %s" % sitename)

        c = self.apiary_wiki.call({
            'action': 'sfautoedit',
            'form': 'Website',
            'target': sitename,
            'Website[Error]': 'No',
            'wpSummary': 'clearing error'
        })
        if self.args.verbose >= 3:
            print("result: %s" % c)

    def connectdb(self):
        # Setup our database connection
        # Use the account that can also insert and delete from the database
        self.apiary_db = mdb.connect(
            host=self.config.get('ApiaryDB', 'hostname'),
            db=self.config.get('ApiaryDB', 'database'),
            user=self.config.get('ApiaryDB RW', 'username'),
            passwd=self.config.get('ApiaryDB RW', 'password'),
            charset='utf8')

    def connectwiki(self, bot_name):
        self.apiary_wiki = MediaWiki(self.config.get('WikiApiary', 'API'))
        c = self.apiary_wiki.login(self.config.get(bot_name, 'Username'),
                                   self.config.get(bot_name, 'Password'))
        if self.args.verbose >= 1:
            print("Username: %s Password: %s" % (self.config.get(bot_name, 'Username'),
                                                 self.config.get(bot_name, 'Password')))
            print(c)

    def get_websites(self, segment, site):
        filter_string = ""
        if site is not None:
            if self.args.verbose >= 1:
                print("Processing site %d." % int(site))
            filter_string = "[[Has ID::%d]]" % int(site)
        elif segment is not None:
            if self.args.verbose >= 1:
                print("Only retrieving segment %d." % int(self.args.segment))
            filter_string = "[[Has bot segment::%d]]" % int(self.args.segment)

        # Build query for sites
        my_query = ''.join([
            '[[Category:Website]]',
            '[[Is defunct::False]]',
            '[[Is active::True]]',
            filter_string,
            '|?Has API URL',
            '|?Has statistics URL',
            '|?Check every',
            '|?Creation date',
            '|?Has ID',
            '|?Collect general data',
            '|?Collect extension data',
            '|?Collect skin data',
            '|?Collect statistics',
            '|?Collect semantic statistics',
            '|?Collect semantic usage',
            '|?Collect logs',
            '|?Collect recent changes',
            '|?Collect statistics stats',
            '|sort=Creation date',
            '|order=asc',
            '|limit=2000'
        ])
        if self.args.verbose >= 3:
            print("Query: %s" % my_query)

        try:
            sites = self.apiary_wiki.call({'action': 'ask', 'query': my_query})
        except Exception as e:
            self.record_error(
                log_message="Problem querying Wikiapiary: %s" % e,
                log_type='error',
                log_severity='important')
        else:
            # We could just return the raw JSON object from the API; however,
            # instead we are going to clean it up into an easier-to-deal-with
            # array of dictionary objects. To keep things sensible, we'll use
            # the same names as the properties.
            i = 0
            if len(sites['query']['results']) > 0:
                my_sites = []
                for pagename, site in sites['query']['results'].items():
                    i += 1
                    if self.args.verbose >= 3:
                        print("[%d] Adding %s." % (i, pagename))
                    # Initialize the flags, but do it carefully in case
                    # there is no value in the wiki yet
                    collect_general_data = list_get(site['printouts'], 'Collect general data') == "t"
                    collect_extension_data = list_get(site['printouts'], 'Collect extension data') == "t"
                    collect_skin_data = list_get(site['printouts'], 'Collect skin data') == "t"
                    collect_statistics = list_get(site['printouts'], 'Collect statistics') == "t"
                    collect_semantic_statistics = list_get(site['printouts'], 'Collect semantic statistics') == "t"
                    collect_semantic_usage = list_get(site['printouts'], 'Collect semantic usage') == "t"
                    collect_statistics_stats = list_get(site['printouts'], 'Collect statistics stats') == "t"
                    collect_logs = list_get(site['printouts'], 'Collect logs') == "t"
                    collect_recent_changes = list_get(site['printouts'], 'Collect recent changes') == "t"
                    has_statistics_url = list_get(site['printouts'], 'Has statistics URL', '')
                    has_api_url = list_get(site['printouts'], 'Has API URL', '')

                    if has_statistics_url.find('wikkii.com') > 0:
                        # Temporarily filter out all Farm:Wikkii sites
                        if self.args.verbose >= 2:
                            print("Skipping %s (%s)" % (pagename, site['fullurl']))
                    else:
                        try:
                            my_sites.append({
                                'pagename': pagename,
                                'fullurl': site['fullurl'],
                                'Has API URL': has_api_url,
                                'Has statistics URL': has_statistics_url,
                                'Check every': int(site['printouts']['Check every'][0]),
                                'Creation date': site['printouts']['Creation date'][0],
                                'Has ID': int(site['printouts']['Has ID'][0]),
                                'Collect general data': collect_general_data,
                                'Collect extension data': collect_extension_data,
                                'Collect skin data': collect_skin_data,
                                'Collect statistics': collect_statistics,
                                'Collect semantic statistics': collect_semantic_statistics,
                                'Collect semantic usage': collect_semantic_usage,
                                'Collect statistics stats': collect_statistics_stats,
                                'Collect logs': collect_logs,
                                'Collect recent changes': collect_recent_changes
                            })
                        except Exception as e:
                            print("Failed to add %s" % pagename)
                            print(e)
                            # Note: the original passed an undefined data_url
                            # as log_url here; the argument is omitted
                            self.record_error(
                                site=site,
                                log_message="Failed to add page",
                                log_type='warn',
                                log_severity='important',
                                log_bot='apiary.py')
                return my_sites
            else:
                raise Exception("No sites were returned to work on.")

    def get_status(self, site):
        """
        get_status will query the website_status table in ApiaryDB. It makes
        the decision if new data should be retrieved for a given website.
        Two booleans are returned, the first to tell if new statistics
        information should be requested and the second to pull general
        information.
        """
        # Get the timestamps for the last statistics and general pulls
        cur = self.apiary_db.cursor()
        temp_sql = "SELECT last_statistics, last_general, check_every_limit FROM website_status WHERE website_id = %d" % site['Has ID']
        cur.execute(temp_sql)
        rows_returned = cur.rowcount

        if rows_returned == 1:
            # Let's see if it's time to pull information again
            data = cur.fetchone()
            cur.close()
            (last_statistics, last_general, check_every_limit) = data[0:3]
            if self.args.verbose >= 3:
                print("last_stats: %s" % last_statistics)
                print("last_general: %s" % last_general)
                print("check_every_limit: %s" % check_every_limit)

            #TODO: make this check the times!
            last_statistics_struct = time.strptime(str(last_statistics), '%Y-%m-%d %H:%M:%S')
            last_general_struct = time.strptime(str(last_general), '%Y-%m-%d %H:%M:%S')

            stats_delta = (time.mktime(time.gmtime()) - time.mktime(last_statistics_struct)) / 60
            general_delta = (time.mktime(time.gmtime()) - time.mktime(last_general_struct)) / 60

            if self.args.verbose >= 2:
                print("Delta from checks: stats %s general %s" % (stats_delta, general_delta))

            (check_stats, check_general) = (False, False)
            # Add randomness to keep checks spread around
            if stats_delta > (site['Check every'] + random.randint(0, 15)) and stats_delta > check_every_limit:
                check_stats = True
            else:
                if self.args.verbose >= 2:
                    print("Skipping stats...")
                self.stats['skippedstatistics'] += 1

            # General checks are always bound to 24 hours, plus a random
            # offset to keep checks evenly distributed
            if general_delta > ((24 + random.randint(0, 24)) * 60):
                check_general = True
            else:
                if self.args.verbose >= 2:
                    print("Skipping general...")
                self.stats['skippedgeneral'] += 1

            return (check_stats, check_general)

        elif rows_returned == 0:
            cur.close()
            # This website doesn't have a status, so we should check everything
            if self.args.verbose >= 3:
                print("website has never been checked before")
            return (True, True)

        else:
            raise Exception("Status check returned multiple rows.")

    def update_status(self, site, checktype):
        # Update the website_status table
        my_now = self.sqlutcnow()

        if checktype == "statistics":
            temp_sql = "UPDATE website_status SET last_statistics = '%s' WHERE website_id = %d" % (my_now, site['Has ID'])
        if checktype == "general":
            temp_sql = "UPDATE website_status SET last_general = '%s' WHERE website_id = %d" % (my_now, site['Has ID'])

        (success, rows_affected) = self.runSql(temp_sql)

        if rows_affected == 0:
            # No rows were updated; this website likely didn't exist before,
            # so we need to insert for the first time
            if self.args.verbose >= 2:
                print("No website_status record exists for ID %d, creating one" % site['Has ID'])
            temp_sql = "INSERT website_status (website_id, last_statistics, last_general, check_every_limit) "
            temp_sql += "VALUES (%d, \"%s\", \"%s\", %d) " % (site['Has ID'], my_now, my_now, 240)
            temp_sql += "ON DUPLICATE KEY UPDATE last_statistics=\"%s\", last_general=\"%s\", check_every_limit=%d" % (my_now, my_now, 240)
            self.runSql(temp_sql)

    def botlog(self, bot, message, type='info', duration=0):
        if self.args.verbose >= 1:
            print(message)

        temp_sql = "INSERT apiary_bot_log (log_date, log_type, bot, duration, message) "
        temp_sql += "VALUES (\"%s\", \"%s\", \"%s\", %f, \"%s\")" % (self.sqlutcnow(), type, bot, duration, message)

        self.runSql(temp_sql)
#!/usr/bin/python
from simplemediawiki import MediaWiki
from tabela import tabela
from people import people
import sys

# Heading and intro for the duty-roster page (Macedonian); roughly: "Duty list --
# this is an automatically generated list of duty shifts with two rotations. If
# you cannot take a shift some week or day, note it in the remarks and write to
# the mailing list. If you are on duty, a new calendar titled 'Hacklab: Duties'
# with events for your duty days has been added to your Google calendar; set up
# notifications so you are alerted in time."
text = '==== Листа на дежурства ====\n\nОва е автоматски генерерирана листа на дежурни со две ротации, доколку не сте во можност да бидете дежурни некоја недела или ден запишете во забелешка и пишете на мејлинг листа. Доколку сте дежурен во вашиот google calendar е вметнат нов календар насловен „Хаклаб: Дежурства“ со настан за деновите кога сте дежурни. Поставете ги известувањата за да бидете навреме известени.\n\n'
text += tabela(people)

wiki = MediaWiki('https://wiki.spodeli.org/api.php')
user, password = open('credentials', 'r').read().split()
wiki.login(user, password)

# Fetch a CSRF token and overwrite section 5 of the duty-roster page
token = wiki.call({'action': 'query', 'meta': 'tokens'})['query']['tokens']['csrftoken']
wiki.call({
    'action': 'edit',
    'title': 'Хаклаб/Дежурства',
    'section': '5',
    'text': text,
    'token': token
})
def get_labs(format):
    """Gets data from all labs from makeinitaly.foundation."""
    labs = []

    # Get the first page of data
    wiki = MediaWiki(makeinitaly__foundation_api_url)
    wiki_response = wiki.call({
        'action': 'query',
        'list': 'categorymembers',
        'cmtitle': 'Category:Italian_FabLabs',
        'cmlimit': '500'
    })
    nextpage = None
    if "query-continue" in wiki_response:
        nextpage = wiki_response["query-continue"]["categorymembers"]["cmcontinue"]

    urls = []
    for i in wiki_response["query"]["categorymembers"]:
        urls.append(i["title"].replace(" ", "_"))

    # Load all the Labs in the first page
    for i in urls:
        current_lab = get_single_lab(i, "object")
        labs.append(current_lab)

    # Load all the Labs from the other pages
    while "query-continue" in wiki_response:
        wiki_response = wiki.call({
            'action': 'query',
            'list': 'categorymembers',
            'cmtitle': 'Category:Italian_FabLabs',
            'cmlimit': '500',
            'cmcontinue': nextpage
        })

        urls = []
        for i in wiki_response["query"]["categorymembers"]:
            urls.append(i["title"].replace(" ", "_"))

        # Load all the Labs
        for i in urls:
            current_lab = get_single_lab(i, "object")
            labs.append(current_lab)

        if "query-continue" in wiki_response:
            nextpage = wiki_response["query-continue"]["categorymembers"]["cmcontinue"]
        else:
            break

    # Transform the list into a dictionary of plain dictionaries
    labs_dict = {}
    for j, k in enumerate(labs):
        labs_dict[j] = k.__dict__

    # Return a dictionary / json
    if format.lower() == "dict" or format.lower() == "json":
        output = labs_dict
    # Return a geojson
    elif format.lower() == "geojson" or format.lower() == "geo":
        labs_list = []
        for l in labs_dict:
            single = labs_dict[l]
            # The Lab objects store coordinates as 'lat'/'long', and
            # GeoJSON expects (longitude, latitude) order
            single_lab = Feature(
                type="Feature",
                geometry=Point((single["long"], single["lat"])),
                properties=single)
            labs_list.append(single_lab)
        output = dumps(FeatureCollection(labs_list))
    # Return a Pandas DataFrame, one row per lab
    elif format.lower() == "pandas" or format.lower() == "dataframe":
        output = pd.DataFrame.from_dict(labs_dict)
        output = output.transpose()
    # Return a list of objects
    elif format.lower() == "object" or format.lower() == "obj":
        output = labs
    # Default: return a list of objects
    else:
        output = labs

    # Return a proper json
    if format.lower() == "json":
        output = json.dumps(labs_dict)
    return output
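A quick usage sketch for the function above, assuming the module-level makeinitaly__foundation_api_url constant, the Lab class, and get_single_lab are defined as in the neighbouring snippets:

# Illustrative calls; each format selector returns a different shape.
labs_json = get_labs("json")      # JSON string keyed by index
labs_geo = get_labs("geojson")    # GeoJSON FeatureCollection string
labs_df = get_labs("pandas")      # DataFrame with one row per lab
print labs_df.shape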
import datetime
import glob
import json
import os
import re
import sys
import textwrap

from simplemediawiki import MediaWiki

with open(os.path.expanduser('~/.mediawiki'), 'r') as f:
    conf = json.loads(f.read())['ircbot']

wiki = MediaWiki(conf['url'])

day_re = re.compile('^--- Day changed (.*) (.*) ([0-9]+) (20[0-9]+)$')
human_re = re.compile('.*<([^>]+)>.*')

days = {}
days_order = []
months = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
          'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}


def make_wiki_login_call(packet):
    # Merge the stored credentials into every login-related API call
    packet.update({'lgname': conf['username'],
                   'lgpassword': conf['password']})
    return wiki.call(packet)
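The helper above only merges credentials into a packet; the actual MediaWiki login is a two-step token exchange. A minimal sketch of how it would be driven, following the same pattern the Wiki class later in this document uses:

# Two-step legacy login: the first call returns a token,
# the second call echoes it back to confirm.
login = make_wiki_login_call({'action': 'login'})
result = make_wiki_login_call({
    'action': 'login',
    'lgtoken': login['login']['token']
})
print result['login']['result']  # 'Success' when the credentials are accepted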
import argparse

from simplemediawiki import MediaWiki

# The code below references args.lang, so the parser presumably also defines
# a --lang argument; the description strings and the 'en' default are assumptions.
parser = argparse.ArgumentParser(description='List article titles from a Wikipedia.')
parser.add_argument('--lang', '-l', dest='lang',
                    help='Language code of the target Wikipedia', default='en')
parser.add_argument('--filename', '-f', dest='filename',
                    help='Output filename', default='list_titles.txt')
args = parser.parse_args()

lang_val = args.lang
filename = args.filename

wiki_url = "https://" + lang_val + ".wikipedia.org/w/api.php"
wiki = MediaWiki(wiki_url)

output_file = open(filename, "w")

continue_param = ''
request_obj = {}
request_obj['action'] = 'query'
request_obj['list'] = 'allpages'
request_obj['aplimit'] = 'max'
request_obj['apnamespace'] = '0'

page_list = wiki.call(request_obj)
pages_in_query = page_list['query']['allpages']

for each_page in pages_in_query:
    page_ID = each_page['pageid']
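The fragment stops after the first allpages batch. A hedged sketch of the continuation loop it appears to be building toward, following the query-continue marker the API returns (writing titles, since the default filename is list_titles.txt, is an assumption):

# Sketch: follow 'query-continue' markers until every title is written out.
while True:
    for each_page in page_list['query']['allpages']:
        output_file.write(each_page['title'].encode('utf-8') + '\n')
    if 'query-continue' not in page_list:
        break
    request_obj['apcontinue'] = \
        page_list['query-continue']['allpages']['apcontinue']
    page_list = wiki.call(request_obj)
output_file.close()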
import time
import ConfigParser
import requests
from BeautifulSoup import BeautifulSoup
from simplemediawiki import MediaWiki


class wikkii:
    # Array to append sites to
    sites = []

    # This page lists all the wikis hosted on the Wikkii farm;
    # we can use it to find wikis to track
    source_list = 'http://wikkii.com/wiki/Special:Farmer/list'

    # Blank reference to store mediawiki object in
    wikiapiary = {}

    # Edit token
    my_token = ""

    # Counter
    create_counter = 0

    def __init__(self):
        config = ConfigParser.ConfigParser()
        config.read('../apiary.cfg')

        self.wikiapiary = MediaWiki(config.get('WikiApiary', 'api'))
        self.wikiapiary.login(config.get('wikkiibot', 'Username'),
                              config.get('wikkiibot', 'Password'))

        # We need an edit token
        c = self.wikiapiary.call({
            'action': 'query',
            'titles': 'Foo',
            'prop': 'info',
            'intoken': 'edit'
        })
        self.my_token = c['query']['pages']['-1']['edittoken']

    def getList(self):
        soup = BeautifulSoup(requests.get(self.source_list).text)
        i = 1
        for item in soup.findAll("a", {"class": "extiw"}):
            site = (item.contents[0], item["href"], item["title"])
            print i, site
            self.sites.append(site)
            i += 1

    def validateStats(self, url):
        my_url = "%s/wiki/Special:Statistics?action=raw" % url
        try:
            result = requests.get(my_url, timeout=10).text
            values = result.split(';')
            if len(values) == 9:
                print "Got %d values from stats" % len(values)
                return True
            else:
                return False
        except:
            print "ERROR: Failed call to Statistics URL."
            return False

    def createSite(self, name, url):
        siteTemplate = """{{Website
|Name=%s
|URL=%s
|Image=Default website image.png
|Farm=Wikkii
|Collection method=API, Special:Statistics
|API URL=%s
|Collect general data=No
|Collect extension data=No
|Collect skin data=No
|Collect statistics=No
|Collect semantic statistics=No
|Collect semantic usage=No
|Collect logs=No
|Collect recent changes=No
|Statistics URL=%s
|Collect statistics stats=Yes
|Check every=240
|Audited=No
|Validated=No
|Curated=No
|Active=Yes
|Demote=No
|Defunct=No
|Error=No
|Featured website vote=0
}}
"""
        api_url = "%sw/api.php" % url
        statistics_url = "%swiki/Special:Statistics" % url

        # Make sure a page doesn't exist with this name already
        c = self.wikiapiary.call({'action': 'query', 'titles': name})
        try:
            if c['query']['pages']['-1']:
                print "No duplicate name detected."
        except:
            # Duplicate detected; disambiguate with the farm name
            name = "%s (Wikkii)" % name

        my_template = siteTemplate % (name, url, api_url, statistics_url)
        print my_template

        c = self.wikiapiary.call({
            'action': 'edit',
            'title': name,
            'text': my_template,
            'token': self.my_token,
            'bot': 'true',
            'summary': 'Creating entry for %s' % name
        })
        print c
        self.create_counter += 1

    def checkSite(self, site):
        print "Checking %s" % site[1]
        # Construct Ask query for WikiApiary and execute it
        my_query = "[[Has statistics URL::%swiki/Special:Statistics]]" % site[1]
        c = self.wikiapiary.call({'action': 'ask', 'query': my_query})
        # Return the count of results for the query
        return int(c['query']['meta']['count'])

    def main(self):
        # Get the list of sites from the farm list
        self.getList()

        for site in self.sites:
            # Limit the number of sites we create per run
            if self.create_counter > 1000:
                break

            print "\nProcessing %s" % site[0]

            # Use a guess of the API domain to see if we have it already
            siteCount = self.checkSite(site)
            if siteCount == 0:
                print "%s is not in WikiApiary, validating stats." % site[0]
                if self.validateStats(site[1]):
                    # Now add it to WikiApiary
                    self.createSite(site[0], site[1])
                    time.sleep(3)
                else:
                    print "%s did not resolve to a valid statistics URL." % site[0]
            elif siteCount == 1:
                print "%s already exists, skipping." % site[0]
            elif siteCount > 1:
                print "ERROR: %s found %d websites, which should never happen." % (site[0], siteCount)
import re
import pprint
import urllib
import time

import sqlalchemy
import solr
from simplemediawiki import MediaWiki

from editing import MusicBrainzClient
from utils import mangle_name, join_names, quote_page_title
import config as cfg

engine = sqlalchemy.create_engine(cfg.MB_DB)
db = engine.connect()
db.execute("SET search_path TO musicbrainz, %s" % cfg.BOT_SCHEMA_DB)

wp = MediaWiki('https://en.wikipedia.org/w/api.php')
wps = solr.SolrConnection('http://localhost:8983/solr/wikipedia')

mb = MusicBrainzClient(cfg.MB_USERNAME, cfg.MB_PASSWORD, cfg.MB_SITE)

"""
CREATE TABLE bot_wp_label (
    gid uuid NOT NULL,
    processed timestamp with time zone DEFAULT now()
);
ALTER TABLE ONLY bot_wp_label
    ADD CONSTRAINT bot_wp_label_pkey PRIMARY KEY (gid);
"""

query = """
import json


class Wiki(object):
    def __init__(self, url, username, password):
        self.wiki = MediaWiki(url)
        self.username = username
        self.password = password

        # Legacy two-step login: the first call fetches a login token,
        # the second call echoes it back to confirm
        self.login = self._make_wiki_login_call({'action': 'login'})
        self.token = self._make_wiki_login_call({
            'action': 'login',
            'lgtoken': self.login['login']['token']
        })

    def _make_wiki_login_call(self, packet):
        packet.update({'lgname': self.username,
                       'lgpassword': self.password})
        response = self.wiki.call(packet)
        if DEBUG:
            print response
        return response

    def all_pages(self):
        response = self.wiki.call({'action': 'query', 'list': 'allpages'})
        if DEBUG:
            print response
        assert_present('query', response)

        while True:
            for page in response['query']['allpages']:
                yield page['title']

            # Older APIs continue with 'apfrom', newer ones with 'apcontinue'
            if 'query-continue' not in response:
                break
            marker = None
            marker_name = None
            for possible in ['apfrom', 'apcontinue']:
                if possible in response['query-continue']['allpages']:
                    marker = response['query-continue']['allpages'][possible]
                    marker_name = possible
                    break
            if marker is None:
                break

            response = self.wiki.call({
                'action': 'query',
                'list': 'allpages',
                marker_name: marker
            })
            if DEBUG:
                print response

    def get_page(self, title):
        response = self.wiki.call({
            'action': 'query',
            'titles': title,
            'prop': 'revisions',
            'rvprop': 'content'
        })
        if DEBUG:
            print response
        assert_present('query', response)

        pages = response['query']['pages']
        page_id = pages.keys()[0]
        if 'revisions' not in pages[page_id]:
            # This is a new page
            return ''
        return pages[page_id]['revisions'][0]['*']

    def check_for_page(self, title):
        response = self.wiki.call({
            'action': 'query',
            'titles': title,
            'prop': 'revisions',
            'rvprop': 'content'
        })
        if DEBUG:
            print response
        assert_present('query', response)

        pages = response['query']['pages']
        page_id = pages.keys()[0]
        return 'revisions' in pages[page_id]

    def post_page(self, title, text, minor=True, bot=True):
        # Fetch an edit token for this page
        response = self.wiki.call({
            'action': 'query',
            'prop': 'info',
            'titles': title,
            'intoken': 'edit'
        })
        if DEBUG:
            print response
        assert_present('query', response)

        pages = response['query']['pages']
        page_id = pages.keys()[0]

        response = self.wiki.call({
            'action': 'edit',
            'minor': minor,
            'bot': bot,
            'title': title,
            # JSON-escape the text, restore real newlines, then strip the
            # surrounding quotes
            'text': json.dumps(text).replace('\\n', '\n')[1:-1],
            'token': pages[page_id]['edittoken']
        })
        if DEBUG:
            print response
        if 'nochange' not in response['edit']:
            print 'Modified %s' % title

    def create_account(self, username, password, email, realname):
        # The first call returns a token which must be echoed back
        response = self.wiki.call({
            'action': 'createaccount',
            'name': username,
            'password': password,
            'email': email,
            'realname': realname
        })
        if DEBUG:
            print response

        response = self.wiki.call({
            'action': 'createaccount',
            'name': username,
            'password': password,
            'email': email,
            'realname': realname,
            'token': response['createaccount']['token']
        })
        if DEBUG:
            print response
        return 'error' not in response
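A short usage sketch for the class above, assuming DEBUG and assert_present are defined at module level as the code implies; the wiki URL and credentials are placeholders:

# Hypothetical values for illustration only.
wiki = Wiki('https://wiki.example.org/w/api.php', 'ExampleBot', 'secret')

# Stream every page title, then append a line to one page.
for title in wiki.all_pages():
    print title

body = wiki.get_page('Sandbox')
wiki.post_page('Sandbox', body + '\nEdited by ExampleBot.')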
from simplemediawiki import MediaWiki
from couchdb.client import Server

server = Server()
try:
    db = server.create('feedme')
except:
    # The database already exists
    db = server['feedme']

wiki = MediaWiki('http://en.wikibooks.org/w/api.php')
recipes = wiki.call({
    'action': 'query',
    'list': 'categorymembers',
    'cmtitle': 'Category:Recipes',
    'cmlimit': 'max'
})

for recipe in recipes['query']['categorymembers']:
    recipe_doc = recipe
    doc_id, doc_rev = db.save(recipe_doc)
    print "Added recipe %s (%s)" % (recipe_doc['title'], doc_id)

    #recipe_content = wiki.call({'action': 'parse', 'text': '{{%s}}' % recipe['title']})
    #print recipe_content
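The commented-out lines hint at fetching the rendered recipe text. A sketch of what that follow-up might look like, using the API's parse action with page rather than transcluding via text; this is an assumption, not the author's code:

# Sketch: render one recipe page to HTML and store it with the document.
recipe_content = wiki.call({
    'action': 'parse',
    'page': recipe['title'],
    'prop': 'text'
})
recipe_doc['html'] = recipe_content['parse']['text']['*']
db.save(recipe_doc)  # recipe_doc already carries _id/_rev from the first save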
def get_single_lab(lab_slug, data_format):
    """Gets data from a single lab from makeinitaly.foundation."""
    wiki = MediaWiki(makeinitaly__foundation_api_url)
    wiki_response = wiki.call({
        'action': 'query',
        'titles': lab_slug,
        'prop': 'revisions',
        'rvprop': 'content'
    })

    # If we don't know the pageid...
    for i in wiki_response["query"]["pages"]:
        content = wiki_response["query"]["pages"][i]["revisions"][0]["*"]

    # Clean the resulting string/list
    newstr01 = content.replace("}}", "")
    newstr02 = newstr01.replace("{{", "")
    result = newstr02.rstrip("\n|").split("\n|")
    # result.remove(u'FabLab')

    # Transform the data into a Lab object
    current_lab = Lab()

    # Add existing data
    for i in result:
        if "coordinates=" in i:
            value = i.replace("coordinates=", "")
            current_lab.coordinates = value
            # Check the wider separator first, since ", " is a substring
            # of " , " and would otherwise always win
            if " , " in value:
                latlong = value.rstrip(" , ").split(" , ")
            elif ", " in value:
                latlong = value.rstrip(", ").split(", ")
            else:
                latlong = ["", ""]
            current_lab.lat = latlong[0]
            current_lab.long = latlong[1]
        elif "province=" in i:
            value = i.replace("province=", "")
            current_lab.province = value.upper()
        elif "region=" in i:
            value = i.replace("region=", "")
            current_lab.region = value
        elif "address=" in i:
            value = i.replace("address=", "")
            current_lab.address = value
        elif "city=" in i:
            value = i.replace("city=", "")
            current_lab.city = value
        elif "fablabsio=" in i:
            value = i.replace("fablabsio=", "")
            current_lab.fablabsio = value
        elif "website=" in i:
            value = i.replace("website=", "")
            current_lab.website = value
        elif "facebook=" in i:
            value = i.replace("facebook=", "")
            current_lab.facebook = value
        elif "twitter=" in i:
            value = i.replace("twitter=", "")
            current_lab.twitter = value
        elif "email=" in i:
            value = i.replace("email=", "")
            current_lab.email = value
        elif "manager=" in i:
            value = i.replace("manager=", "")
            current_lab.manager = value
        elif "birthyear=" in i:
            value = i.replace("birthyear=", "")
            current_lab.birthyear = value

    current_lab.text_en = get_lab_text(lab_slug=lab_slug, language="en")
    current_lab.text_it = get_lab_text(lab_slug=lab_slug, language="it")

    if data_format == "dict":
        return current_lab.__dict__
    elif data_format == "object":
        return current_lab
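A usage sketch for this retrieval function, assuming makeinitaly__foundation_api_url and the Lab class are defined as the surrounding code implies; the slug is a hypothetical page name:

# Hypothetical slug for illustration.
lab = get_single_lab("FabLab_Example", "object")
print lab.city, lab.lat, lab.long
print lab.text_en

as_dict = get_single_lab("FabLab_Example", "dict")
print as_dict.keys()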
import urlparse
import ConfigParser
import requests
from BeautifulSoup import BeautifulSoup
from simplemediawiki import MediaWiki


class smw_community:
    sites = []

    # Blank references to store mediawiki objects in
    wikiapiary = {}
    smwreferata = {}

    # Edit token
    my_token = ""

    # Counter
    create_counter = 0

    def __init__(self):
        config = ConfigParser.ConfigParser()
        config.read('../apiary.cfg')

        # Connect to SMW Community Wiki
        self.smwreferata = MediaWiki('http://smw.referata.com/w/api.php')

        # Connect to WikiApiary
        self.wikiapiary = MediaWiki(config.get('WikiApiary', 'api'))
        self.wikiapiary.login(config.get('wikkiibot', 'Username'),
                              config.get('wikkiibot', 'Password'))

        # We need an edit token
        c = self.wikiapiary.call({
            'action': 'query',
            'titles': 'Foo',
            'prop': 'info',
            'intoken': 'edit'
        })
        self.my_token = c['query']['pages']['-1']['edittoken']

    def load_from_smwreferata(self):
        # Build query for sites
        my_query = ''.join([
            '[[Category:Sites]]',
            '[[Has status::Active]]',
            '|?Has URL',
            '|?Has data type',
            '|limit=1000'
        ])
        print "Query: %s" % my_query

        sites = self.smwreferata.call({'action': 'ask', 'query': my_query})

        # We could just return the raw JSON object from the API, however
        # instead we are going to clean it up into an easier to deal with
        # array of dictionary objects.
        # To keep things sensible, we'll use the same name as the properties
        if len(sites['query']['results']) > 0:
            for pagename, site in sites['query']['results'].items():
                print "Adding %s." % pagename
                self.sites.append({
                    'Name': pagename,
                    'URL': site['printouts']['Has URL'][0],
                    'Tag': ','.join(site['printouts']['Has data type'])
                })

    def add_api_to_sites(self):
        # Loop through the sites and find API urls
        for i in range(0, len(self.sites)):
            print "Investigating %s (%s)..." % (self.sites[i]['Name'],
                                                self.sites[i]['URL'])
            try:
                req = requests.get(self.sites[i]['URL'])
                if req.status_code == 200:
                    soup = BeautifulSoup(req.text)
                    # The EditURI link tag points at api.php
                    api_url = soup.findAll('link', {'rel': 'EditURI'})[0]['href']
                    print "  Found %s" % api_url
                    new_api_url = urlparse.urlunparse(
                        urlparse.urlparse(api_url)[0:3] + ('', '', ''))
                    print "  Resolved %s" % new_api_url
                    self.sites[i]['API URL'] = new_api_url
                else:
                    print "  Returned %s" % req.status_code
            except Exception, e:
                print "Exception: %s" % e
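A minimal driver sketch for the class above, assuming ../apiary.cfg supplies the credentials it reads:

# Hypothetical entry point; not part of the original script.
bot = smw_community()
bot.load_from_smwreferata()
bot.add_api_to_sites()
for site in bot.sites:
    print site['Name'], site.get('API URL', '(no API found)')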
|Active=No
|Demote=No
|Defunct=No
}}

[[Category:WikiTeam Import]]"""

logo_page_text = """This image was automatically uploaded by [[User:Audit Bee|Audit Bee]] while importing.
[[Category:Import logos]]
"""

# timeout in seconds
timeout = 10
socket.setdefaulttimeout(timeout)

wiki = MediaWiki(
    'https://wikiapiary.com/w/api.php',
    cookie_file='cookie-jar',
    user_agent='python-simplemediawiki/1.1.1 (WikiApiary; Bumble Bee; +http://wikiapiary.com/wiki/User:Bumble_Bee)')
wiki.login('Audit Bee', 'frYqj2AmPTqZDjn4TANE')

# We need an edit token
c = wiki.call({
    'action': 'query',
    'titles': 'Foo',
    'prop': 'info',
    'intoken': 'edit'
})
my_token = c['query']['pages']['-1']['edittoken']

i = 0
success = 0
import time
import ConfigParser
import requests
from BeautifulSoup import BeautifulSoup
from simplemediawiki import MediaWiki


class TropicalWikis:
    # Array to append sites to
    sites = []

    # This page lists all the wikis hosted on the TropicalWikis farm;
    # we can use it to find wikis to track
    source_list = 'http://www.tropicalwikis.com/wiki/Special:Farmer/list'

    # Blank reference to store mediawiki object in
    wikiapiary = {}

    # Edit token
    my_token = ""

    # Counter
    create_counter = 0

    def __init__(self):
        config = ConfigParser.ConfigParser()
        config.read('../apiary.cfg')

        self.wikiapiary = MediaWiki(config.get('WikiApiary', 'api'))
        self.wikiapiary.login(config.get('TropicalBot', 'Username'),
                              config.get('TropicalBot', 'Password'))

        # We need an edit token
        c = self.wikiapiary.call({
            'action': 'query',
            'titles': 'Foo',
            'prop': 'info',
            'intoken': 'edit'
        })
        self.my_token = c['query']['pages']['-1']['edittoken']

    def getList(self):
        soup = BeautifulSoup(requests.get(self.source_list).text)
        i = 1
        for item in soup.findAll("a", {"class": "external text"}):
            site = (item.contents[0], item["href"])
            print i, site
            self.sites.append(site)
            i += 1

    def createSite(self, name, url):
        siteTemplate = """{{Website
|Name=%s
|URL=%s
|Image=Default website image.png
|Farm=TropicalWikis
|Collection method=API
|API URL=%s
|Collect general data=Yes
|Collect extension data=Yes
|Collect skin data=Yes
|Collect statistics=Yes
|Collect semantic statistics=No
|Collect semantic usage=No
|Collect logs=No
|Collect recent changes=No
|Statistics URL=
|Collect statistics stats=Yes
|Check every=240
|Audited=No
|Validated=No
|Curated=No
|Active=No
|Demote=No
|Defunct=No
|Error=No
|Featured website vote=0
}}
"""
        api_url = "%s/w/api.php" % url

        # Make sure a page doesn't exist with this name already
        c = self.wikiapiary.call({'action': 'query', 'titles': name})
        try:
            if c['query']['pages']['-1']:
                print "No duplicate name detected."
        except:
            # Duplicate detected; disambiguate with the farm name
            name = "%s (TropicalWikis)" % name

        my_template = siteTemplate % (name, url, api_url)
        print my_template

        c = self.wikiapiary.call({
            'action': 'edit',
            'title': name,
            'text': my_template,
            'token': self.my_token,
            'bot': 'true',
            'summary': 'Creating entry for %s' % name
        })
        print c
        self.create_counter += 1

    def checkSite(self, site):
        print "Checking %s" % site[1]
        # Construct Ask query for WikiApiary and execute it
        my_query = "[[Has API URL::%s/w/api.php]]" % site[1]
        c = self.wikiapiary.call({'action': 'ask', 'query': my_query})
        # Return the count of results for the query
        return int(c['query']['meta']['count'])

    def main(self):
        # Get the list of sites from the farm list
        self.getList()

        for site in self.sites:
            print "\nProcessing %s" % site[0]

            # Use a guess of the API domain to see if we have it already
            siteCount = self.checkSite(site)
            if siteCount == 0:
                print "%s is not in WikiApiary, creating it." % site[0]
                # Now add it to WikiApiary
                self.createSite(site[0], site[1])
                time.sleep(3)
            elif siteCount == 1:
                print "%s already exists, skipping." % site[0]
            elif siteCount > 1:
                print "ERROR: %s found %d websites, which should never happen." % (site[0], siteCount)
class Wikidata(object):
    """Talks to the Wikidata API to retrieve information about entities
    and claims."""

    PROPERTY_COUNTRY = 'P17'
    PROPERTY_FLAG_IMAGE = 'P41'
    PROPERTY_APPLIES_TO_TERRITORIAL_JURISDICTION = 'P1001'

    def __init__(self):
        self.client = MediaWiki('https://www.wikidata.org/w/api.php')

    def get_entities_from_title(self, title, sites='enwiki'):
        """Return the entities matching the supplied title in a list.

        Arguments:
        title -- Name of the entity
        sites -- Wikidata site which should be searched for the title
                 (default enwiki)

        Returns an empty list when no matching entity was found.
        """
        params = {'action': 'wbgetentities',
                  'sites': sites,
                  'titles': title,
                  'props': ''}
        call = self.client.call(params)
        entities = call['entities'].keys()

        result = list()
        # A missing title comes back under the string key '-1'
        if entities[0] != '-1':
            for entity in entities:
                result.append(entity)
        return result

    def get_claims_from_entity(self, entity, property=None):
        """Return the claims of the supplied entity.

        Arguments:
        entity -- Entity identifier
        property -- Filter to return only claims which have this property
                    (default None)

        Returns a dictionary containing each claim. The value holds a list
        with the property values. Returns None when the entity was not found.
        """
        params = {'action': 'wbgetclaims', 'entity': entity}
        if property is not None:
            params['property'] = property
        call = self.client.call(params)

        # If the entity was not found, or the claims dictionary is empty, return
        if u'error' in call or not call['claims']:
            return None

        claims = call['claims']
        result = dict()
        for property in claims:
            result[property] = list()
            values = claims[property]
            # multiple values are possible (see P31 on Q42)
            for value in values:
                result[property].append(value['mainsnak']['datavalue'])
        return result
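A usage sketch for the class above; the title lookup and the property filter use the class's own constants, but treat the concrete output as illustrative:

wd = Wikidata()

# Resolve an English Wikipedia title to Wikidata entity IDs.
entities = wd.get_entities_from_title('Italy')
print entities  # e.g. [u'Q38']

# Fetch only the 'country' (P17) claims for the first match, if any.
if entities:
    claims = wd.get_claims_from_entity(entities[0],
                                       property=Wikidata.PROPERTY_COUNTRY)
    print claims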
def get_single_lab(lab_slug, open_cage_api_key):
    """Gets data from a single lab from hackerspaces.org."""
    wiki = MediaWiki(hackerspaces_org_api_url)
    wiki_response = wiki.call({
        'action': 'query',
        'titles': lab_slug,
        'prop': 'revisions',
        'rvprop': 'content'
    })

    # If we don't know the pageid...
    for i in wiki_response["query"]["pages"]:
        content = wiki_response["query"]["pages"][i]["revisions"][0]["*"]

    # Transform the data into a Lab object
    current_lab = Hackerspace()
    equipment_list = []

    # Template parameters that map straight onto attributes of the same name
    simple_fields = ('logo', 'founding', 'membercount', 'fee', 'size',
                     'status', 'site', 'wiki', 'irc', 'jabber', 'phone',
                     'youtube', 'eventbrite', 'facebook', 'ustream',
                     'flickr', 'twitter', 'googleplus', 'email',
                     'maillist', 'ical', 'forum')

    # Parse the MediaWiki markup
    wikicode = mwparserfromhell.parse(content)
    for k in wikicode.filter_templates():
        element_name = unicode(k.name)
        if "Hackerspace" in element_name:
            current_lab.name = lab_slug
            for j in k.params:
                j_value = unicode(j.value)
                j_name = unicode(j.name)

                # Remove new lines in content
                if j_value[-1:] == "\n" or j_value[:1] == "\n":
                    j_value = j_value.replace('\n', '')

                if j_name in simple_fields:
                    setattr(current_lab, j_name, j_value)
                elif j_name == "coordinate":
                    # Clean the coordinates
                    for char in ('"', 'N', 'S', 'W', 'E', u'°', ' '):
                        j_value = j_value.replace(char, '')

                    # Get the full address from the coordinates
                    address = get_location(query=j_value,
                                           format="reverse",
                                           api_key=open_cage_api_key)
                    current_lab.city = address["city"]
                    current_lab.county = address["county"]
                    current_lab.state = address["state"]
                    current_lab.postal_code = address["postal_code"]
                    current_lab.address_1 = address["address_1"]
                    current_lab.country = address["country"]
                    current_lab.country_code = address["country_code"]
                    current_lab.continent = address["continent"]
                    current_lab.latitude = address["latitude"]
                    current_lab.longitude = address["longitude"]
        elif "Equipment" in element_name:
            for j in k.params:
                # Parameters render as 'equipment=...'; strip the key
                equipment_list.append(unicode(j).replace("equipment=", ""))

    current_lab.equipment = equipment_list

    # Load the free text: everything that is not a template node
    freetext = ""
    for k in wikicode.nodes:
        try:
            k.name
        except AttributeError:
            freetext += unicode(k)
    current_lab.text = freetext

    return current_lab
class WikimediaCommons(object):
    """Talks to the Wikimedia Commons API to retrieve information about
    an image hosted on Wikimedia Commons.
    """

    def __init__(self):
        self.client = MediaWiki('https://commons.wikimedia.org/w/api.php')

    def __call_api(self, title, namespace='Image', thumbwidth=None):
        """Call the Commons API.

        Arguments:
        title -- Title of the page
        namespace -- Namespace this title lies in (default 'Image')
        thumbwidth -- Width in pixels required for the thumbnail image URL
                      (default None)

        Returns the API response or None when the title was not found.
        """
        title = '{0}:{1}'.format(namespace, title)
        params = {'action': 'query',
                  'titles': title,
                  'prop': 'imageinfo',
                  'iiprop': 'url'}
        if thumbwidth is not None:
            params['iiurlwidth'] = thumbwidth
        result = self.client.call(params)

        pages = result['query']['pages']
        page_id = pages.keys()[0]
        # A missing page comes back under the string key '-1'
        if page_id == '-1':
            return None
        return pages[page_id]

    def get_image_url(self, name):
        """Retrieve the URL to the raw image.

        Arguments:
        name -- Image name

        Returns the image URL or None when the image was not found.
        """
        result = self.__call_api(name)
        if result is None:
            return None
        image = result['imageinfo'][0]
        return image['url']

    def get_thumb_image_url(self, name, width):
        """Retrieve the URL to the thumbnail image.

        Arguments:
        name -- Image name
        width -- Requested width in pixels

        Returns the thumbnail image URL or None when the image was not found.
        """
        result = self.__call_api(name, thumbwidth=width)
        if result is None:
            return None
        image = result['imageinfo'][0]
        return image['thumburl']
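A brief usage sketch; the file name is one of the Commons logos referenced earlier in this document, but any existing image name works:

commons = WikimediaCommons()

# Full-size URL, then a 200px-wide thumbnail of the same image.
print commons.get_image_url('Wikipedia-logo.png')
print commons.get_thumb_image_url('Wikipedia-logo.png', 200)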