def call_api(action, cache=True, **kwargs):
    """ Invoke the given method of wikidata APIs with the given parameters """
    kwargs['format'] = 'json'
    kwargs['action'] = action
    resp = io.get_and_cache(WIKIDATA_API_URL, use_cache=cache, params=kwargs)
    return json.loads(resp)
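# Usage sketch, assuming WIKIDATA_API_URL points at the standard endpoint
# https://www.wikidata.org/w/api.php and io.get_and_cache returns the raw
# response body: the stock `wbgetentities` action fetches an item's labels.
def _example_call_api():
    entity = call_api('wbgetentities', ids='Q42', props='labels', languages='en')
    # e.g. u'Douglas Adams'
    return entity['entities']['Q42']['labels']['en']['value']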
def main(out_file, use_cache, step):
    finished = False
    offset = 0
    person = {}
    while not finished:
        print 'processed %d records' % offset
        url = query_url.format(limit=step, offset=offset)
        result = json.loads(get_and_cache(url, use_cache))
        offset += step
        finished = len(result['results']['bindings']) < step

        # adjacent records can refer to the same person, so group all properties in sets
        for data in result['results']['bindings']:
            person_url = data.pop('person')['value']
            if person and person['url'] != person_url:
                out_file.write(serialize_person(person))
                out_file.write('\n')
                person = {}

            person['url'] = person_url
            for key, value in data.iteritems():
                value = value['value']
                if key == 'gender':
                    # keep only the trailing QID of the gender entity URI
                    value = value.split('/')[-1]

                if key in person:
                    person[key].add(value)
                else:
                    person[key] = {value}

    # flush the last person accumulated once the results run out
    if person:
        out_file.write(serialize_person(person))
        out_file.write('\n')
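# Illustration of the grouping above, using the standard SPARQL JSON results
# layout (result['results']['bindings'] is a list of {variable: {'type': ...,
# 'value': ...}} dicts); the two hypothetical rows below refer to the same
# ?person and collapse into a single record with set-valued properties.
def _example_group_bindings():
    bindings = [
        {'person': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q42'},
         'name': {'type': 'literal', 'value': 'Douglas Adams'},
         'gender': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q6581097'}},
        {'person': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q42'},
         'name': {'type': 'literal', 'value': 'Douglas Noel Adams'},
         'gender': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q6581097'}},
    ]
    person = {}
    for data in bindings:
        person['url'] = data.pop('person')['value']
        for key, value in data.iteritems():
            value = value['value']
            if key == 'gender':
                value = value.split('/')[-1]
            person.setdefault(key, set()).add(value)
    # -> {'url': '...Q42', 'name': set(['Douglas Adams', 'Douglas Noel Adams']),
    #     'gender': set(['Q6581097'])}
    return person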
def who_was_who(ctx):
    """ Download the OCR text of the 'Who Was Who' companion volume from
        archive.org and extract the entry headwords it contains """
    out_file = ctx.obj.pop('out_file')
    use_cache = ctx.obj.pop('cache')
    url = 'https://archive.org/download/whowaswhocompani01londuoft/' \
          'whowaswhocompani01londuoft_djvu.txt'
    text = get_and_cache(url, use_cache)
    parse_and_save(text, ur'([A-Z]+, ([0-9A-Z][. \-a-z]+)+),[^,]+[,;]', out_file, url)
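# Quick check of the headword pattern above (illustrative, on a made-up OCR
# line): group 1 captures the "SURNAME, Forenames" part of an entry.
def _example_who_was_who_pattern():
    import re
    pattern = re.compile(ur'([A-Z]+, ([0-9A-Z][. \-a-z]+)+),[^,]+[,;]', re.UNICODE)
    match = pattern.search(u'ABBOTT, William Henry, shipping agent;')
    return match.group(1)  # -> u'ABBOTT, William Henry'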
def american_bio(ctx):
    """ Download the ten OCR'd volumes of the biographical dictionary from
        archive.org and extract the entry headwords they contain """
    out_file = ctx.obj.pop('out_file')
    use_cache = ctx.obj.pop('cache')
    for volume in xrange(1, 11):
        print 'Volume', volume
        volume_url = 'https://archive.org/download/biographicaldict{volume:02d}johnuoft/' \
                     'biographicaldict{volume:02d}johnuoft_djvu.txt'.format(volume=volume)
        vol = get_and_cache(volume_url, use_cache)
        parse_and_save(vol, ur'([A-Z]+, [A-Z][a-z]+ ?),[^,]+,', out_file, volume_url)
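# Same idea for the pattern above (illustrative, made-up entry line): group 1
# captures the shorter "SURNAME, Forename" headword used in these volumes.
def _example_american_bio_pattern():
    import re
    pattern = re.compile(ur'([A-Z]+, [A-Z][a-z]+ ?),[^,]+,', re.UNICODE)
    match = pattern.search(u'ADAMS, John, second president of the United States,')
    return match.group(1)  # -> u'ADAMS, John'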
def wikidata_id_from_wikipedia_url(wiki_url):
    title = urlparse(wiki_url).path[len('/wiki/'):]
    data = json.loads(io.get_and_cache(
        'https://en.wikipedia.org/w/api.php?action=query&prop=pageprops&format=json&titles=' + title
    ))
    # page ids come back as string keys; missing pages are reported with id -1
    ids = [
        page['pageprops']['wikibase_item']
        for pid, page in data['query']['pages'].iteritems()
        if int(pid) >= 0 and 'wikibase_item' in page.get('pageprops', {})
    ]

    if not ids:
        logger.debug('failed to reconcile uri %s with a wikidata page', wiki_url)
        return None
    elif len(ids) > 1:
        logger.debug('uri %s was reconciled to items %s, picking the first one',
                     wiki_url, ', '.join(ids))
    return ids[0]
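# Usage sketch: the English Wikipedia article on Douglas Adams is linked to
# the wikidata item Q42, so reconciling its URL should yield that id.
def _example_wikidata_id():
    return wikidata_id_from_wikipedia_url('https://en.wikipedia.org/wiki/Douglas_Adams')  # -> u'Q42'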