def main(): pages_with_viaf = set() for page in fetch_pages_with_viaf(): pages_with_viaf.add(page) artist_viaf = {} rows = sdb.execute("SELECT artist, url, viaf FROM viaf") for artist, url, viaf in rows: artist_viaf[artist] = { 'url': url, 'viaf': viaf, 'submitted': submitted } cnt = 0 for artist in db.execute(wp_url_query): if artist['id'] in artist_viaf: continue page = extract_page_title(artist['url'], wp_lang, normalize=True) if page not in pages_with_viaf: continue cnt += 1 viaf = fetch_viaf(page) print artist, viaf sdb.execute('INSERT INTO viaf (artist, url, viaf) VALUES (?, ?, ?)', (artist['id'], artist['url'], viaf)) sdb.commit() print cnt
def fetch(cls, url, use_cache=True):
    """Build an instance from a Wikipedia article URL.

    Extracts the language code from the subdomain, resolves the page
    title, and loads the page content (via cache when `use_cache`).
    Raises AttributeError if `url` is not a Wikipedia URL (the regex
    match fails), matching the original behavior.
    """
    # Accept any subdomain language code (e.g. 'en', 'simple',
    # 'zh-min-nan'); the previous `[a-z]{2}` pattern only matched
    # two-letter codes, unlike the sibling fetch variants in this file.
    m = re.match(r'^http://([a-z\-]+)\.wikipedia\.org', url)
    page_lang = m.group(1).encode('utf8')
    page_title = extract_page_title(url, page_lang)
    wp = MediaWiki('http://%s.wikipedia.org/w/api.php' % page_lang)
    return cls(
        page_title,
        get_page_content(wp, page_title, page_lang, use_cache) or '',
        page_lang)
def fetch(cls, url, use_cache=True):
    """Fetch a Wikipedia page's wikitext and Wikidata item id for `url`.

    Queries the MediaWiki API for the latest revision content and the
    `wikibase_item` page property, then constructs an instance with
    (title, content, language, wikidata_id). Missing pages yield ''
    content and a None wikidata_id.
    """
    m = re.match(r'^http://([a-z\-]+)\.wikipedia\.org', url)
    page_lang = m.group(1).encode('utf8')
    page_title = extract_page_title(url, page_lang)
    wp = MediaWiki('http://%s.wikipedia.org/w/api.php' % page_lang)
    resp = wp.call({'action': 'query',
                    'prop': 'pageprops|revisions',
                    'titles': page_title.encode('utf8'),
                    'rvprop': 'content'})
    page = resp['query']['pages'].values()[0]
    # With rvprop=content the wikitext is stored under the '*' key of the
    # revision dict (legacy API format); the old `.values()[0]` picked an
    # arbitrary value, since 'contentformat'/'contentmodel' sit alongside
    # '*' and dict ordering is not guaranteed in this Python version.
    content = page['revisions'][0].get('*') if 'revisions' in page else None
    if 'pageprops' in page and 'wikibase_item' in page['pageprops']:
        wikidata_id = page['pageprops']['wikibase_item']
    else:
        wikidata_id = None
    return cls(page_title, content or '', page_lang, wikidata_id)
def fetch(cls, url, use_cache=True):
    """Retrieve page content and Wikidata item id for a Wikipedia URL.

    Calls the MediaWiki query API (`pageprops|revisions`, rvprop=content)
    and builds an instance from (title, content, language, wikidata_id).
    Pages without revisions produce '' content; pages without a
    `wikibase_item` pageprop produce wikidata_id=None.
    """
    m = re.match(r'^http://([a-z\-]+)\.wikipedia\.org', url)
    page_lang = m.group(1).encode('utf8')
    page_title = extract_page_title(url, page_lang)
    wp = MediaWiki('http://%s.wikipedia.org/w/api.php' % page_lang)
    resp = wp.call({
        'action': 'query',
        'prop': 'pageprops|revisions',
        'titles': page_title.encode('utf8'),
        'rvprop': 'content'
    })
    page = resp['query']['pages'].values()[0]
    # The revision's wikitext lives under the '*' key in the legacy API
    # response; `.values()[0]` relied on unspecified dict ordering and
    # could return 'contentformat'/'contentmodel' instead.
    content = page['revisions'][0].get('*') if 'revisions' in page else None
    if 'pageprops' in page and 'wikibase_item' in page['pageprops']:
        wikidata_id = page['pageprops']['wikibase_item']
    else:
        wikidata_id = None
    return cls(page_title, content or '', page_lang, wikidata_id)
def main(): pages_with_viaf = set() for page in fetch_pages_with_viaf(): pages_with_viaf.add(page) artist_viaf = {} rows = sdb.execute("SELECT artist, url, viaf FROM viaf") for artist, url, viaf in rows: artist_viaf[artist] = {"url": url, "viaf": viaf, "submitted": submitted} cnt = 0 for artist in db.execute(wp_url_query): if artist["id"] in artist_viaf: continue page = extract_page_title(artist["url"], wp_lang, normalize=True) if page not in pages_with_viaf: continue cnt += 1 viaf = fetch_viaf(page) print artist, viaf sdb.execute("INSERT INTO viaf (artist, url, viaf) VALUES (?, ?, ?)", (artist["id"], artist["url"], viaf)) sdb.commit() print cnt
def fetch(cls, url, use_cache=True):
    """Construct an instance from a Wikipedia article URL.

    Parses the language from the subdomain, derives the page title, and
    loads the page content via `get_page_content` (honoring `use_cache`).
    Raises AttributeError on a non-Wikipedia URL (regex match fails), as
    before.
    """
    # Match any subdomain language code ('en', 'simple', 'zh-min-nan', ...);
    # `[a-z]{2}` rejected everything but two-letter codes and disagreed
    # with the other fetch variants in this file.
    m = re.match(r'^http://([a-z\-]+)\.wikipedia\.org', url)
    page_lang = m.group(1).encode('utf8')
    page_title = extract_page_title(url, page_lang)
    wp = MediaWiki('http://%s.wikipedia.org/w/api.php' % page_lang)
    return cls(page_title,
               get_page_content(wp, page_title, page_lang, use_cache) or '',
               page_lang)