예제 #1
0
def call_api(action, cache=True, **kwargs):
    """ Call an action of the wikidata API and return the decoded response.

    :param action: value sent as the `action` request parameter
    :param cache: forwarded to `io.get_and_cache` as `use_cache`
    :param kwargs: any other request parameter; `format` and `action`
     are always overridden
    :return: the response body parsed with `json.loads`
    """
    # keyword arguments passed to dict() win over the entries copied from kwargs
    params = dict(kwargs, format='json', action=action)
    body = io.get_and_cache(WIKIDATA_API_URL, use_cache=cache, params=params)
    return json.loads(body)
예제 #2
0
def call_api(action, cache=True, **kwargs):
    """ Send a request to the wikidata API and parse its JSON answer.

    :param action: name of the API action, sent as the `action` parameter
    :param cache: passed to `io.get_and_cache` as `use_cache`
    :param kwargs: extra request parameters, sent together with
     `format=json` and the given action
    :return: whatever `json.loads` produces from the response body
    """
    request = dict(kwargs)
    # set last so they replace any caller-supplied 'format' / 'action'
    request.update(format='json', action=action)
    return json.loads(
        io.get_and_cache(WIKIDATA_API_URL, use_cache=cache, params=request)
    )
예제 #3
0
def main(out_file, use_cache, step):
    """ Download the query results page by page and write one person per line.

    Pages of `step` bindings are requested through `query_url` until a page
    with fewer than `step` bindings is returned. Consecutive bindings that
    share the same person url are merged into a single record in which every
    property holds the set of values seen for it.

    :param out_file: object with a `write` method, receives each serialized
     person followed by a newline
    :param use_cache: forwarded to `get_and_cache`
    :param step: page size, used both as query limit and as offset increment
    """
    finished = False
    offset = 0
    person = {}
    while not finished:
        print('processed %d records' % offset)
        page_url = query_url.format(limit=step, offset=offset)
        bindings = json.loads(get_and_cache(page_url, use_cache))['results']['bindings']
        offset += step
        # a page shorter than requested is taken as the last one
        finished = len(bindings) < step

        # adjacent records can refer to the same person, so group all properties in sets
        for data in bindings:
            person_url = data.pop('person')['value']

            # a different url means the previous person is complete
            if person and person['url'] != person_url:
                out_file.write(serialize_person(person))
                out_file.write('\n')
                person = {}

            person['url'] = person_url
            for key, value in data.items():
                value = value['value']
                if key == 'gender':
                    # keep only the last path segment of the value
                    value = value.split('/')[-1]

                person.setdefault(key, set()).add(value)

    # persons are only written when the following one starts, so the last
    # one is still pending once all pages have been consumed
    if person:
        out_file.write(serialize_person(person))
        out_file.write('\n')
예제 #4
0
def who_was_who(ctx):
    out_file = ctx.obj.pop('out_file')
    use_cache = ctx.obj.pop('cache')

    url = 'https://archive.org/download/whowaswhocompani01londuoft/' \
          'whowaswhocompani01londuoft_djvu.txt'
    text = get_and_cache(url, use_cache)
    parse_and_save(text, ur'([A-Z]+, ([0-9A-Z][. \-a-z]+)+),[^,]+[,;]', out_file, url)
예제 #5
0
def who_was_who(ctx):
    out_file = ctx.obj.pop('out_file')
    use_cache = ctx.obj.pop('cache')

    url = 'https://archive.org/download/whowaswhocompani01londuoft/' \
          'whowaswhocompani01londuoft_djvu.txt'
    text = get_and_cache(url, use_cache)
    parse_and_save(text, ur'([A-Z]+, ([0-9A-Z][. \-a-z]+)+),[^,]+[,;]',
                   out_file, url)
예제 #6
0
def american_bio(ctx):
    out_file = ctx.obj.pop('out_file')
    use_cache = ctx.obj.pop('cache')

    for volume in xrange(1, 11):
        print 'Volume', volume
        volume_url = 'https://archive.org/download/biographicaldict{volume:02d}johnuoft/' \
                     'biographicaldict{volume:02d}johnuoft_djvu.txt'.format(volume=volume)
        vol = get_and_cache(volume_url, use_cache)
        parse_and_save(vol, ur'([A-Z]+, [A-Z][a-z]+ ?),[^,]+,', out_file, volume_url)
예제 #7
0
def american_bio(ctx):
    out_file = ctx.obj.pop('out_file')
    use_cache = ctx.obj.pop('cache')

    for volume in xrange(1, 11):
        print 'Volume', volume
        volume_url = 'https://archive.org/download/biographicaldict{volume:02d}johnuoft/' \
                     'biographicaldict{volume:02d}johnuoft_djvu.txt'.format(volume=volume)
        vol = get_and_cache(volume_url, use_cache)
        parse_and_save(vol, ur'([A-Z]+, [A-Z][a-z]+ ?),[^,]+,', out_file,
                       volume_url)
예제 #8
0
def wikidata_id_from_wikipedia_url(wiki_url):
    """ Resolve the url of an english wikipedia page to its wikidata item.

    The page title is whatever follows the leading '/wiki/' in the path of
    the url; it is looked up with a `pageprops` query against the
    en.wikipedia.org API.

    :param wiki_url: url of the wikipedia page
    :return: the `wikibase_item` of the first matching page, or None when
     no returned page has one
    """
    title = urlparse(wiki_url).path[len('/wiki/'):]
    # NOTE(review): the title is appended as found in the url; a title with
    # characters such as '&' would alter the query string - confirm
    data = json.loads(io.get_and_cache(
        'https://en.wikipedia.org/w/api.php?action=query&prop=pageprops&format=json&titles=' + title
    ))

    # page ids are the keys of a json object, hence strings: convert them
    # before comparing with a number
    ids = [
        page['pageprops']['wikibase_item']
        for pid, page in data['query']['pages'].items()
        if int(pid) >= 0 and 'wikibase_item' in page.get('pageprops', {})
    ]

    if not ids:
        logger.debug('failed to reconcile uri %s with a wikidata page', wiki_url)
        return None
    elif len(ids) > 1:
        logger.debug('uri %s was reconciled to items %s, picking the first one',
                     wiki_url, ', '.join(ids))

    return ids[0]
예제 #9
0
def wikidata_id_from_wikipedia_url(wiki_url):
    """ Find the wikidata item linked to an english wikipedia page.

    The part of the url path after the leading '/wiki/' is used as page
    title in a `pageprops` query against the en.wikipedia.org API.

    :param wiki_url: url of the wikipedia page
    :return: the `wikibase_item` of the first matching page, or None when
     no returned page has one
    """
    title = urlparse(wiki_url).path[len('/wiki/'):]
    # NOTE(review): the title is appended as found in the url; a title with
    # characters such as '&' would alter the query string - confirm
    data = json.loads(
        io.get_and_cache(
            'https://en.wikipedia.org/w/api.php?action=query&prop=pageprops&format=json&titles='
            + title))

    # page ids are the keys of a json object, hence strings: convert them
    # before comparing with a number
    ids = [
        page['pageprops']['wikibase_item']
        for pid, page in data['query']['pages'].items()
        if int(pid) >= 0 and 'wikibase_item' in page.get('pageprops', {})
    ]

    if not ids:
        logger.debug('failed to reconcile uri %s with a wikidata page',
                     wiki_url)
        return None
    elif len(ids) > 1:
        logger.debug(
            'uri %s was reconciled to items %s, picking the first one',
            wiki_url, ', '.join(ids))

    return ids[0]