def get_wikidata_sitelinks(source, target, titles):
    """
    Find the Wikidata items for ``titles`` that have a sitelink on the
    source wiki but none on the target wiki.

    Returns a dict keyed by the source-wiki title (spaces replaced with
    underscores) whose values are the corresponding Wikidata ids. The
    dict is empty when the request raises ``ValueError`` or when the
    response has no ``entities`` key.
    """
    wikidata_url = configuration.get_config_value('endpoints', 'wikidata')
    query = configuration.get_config_dict('wikidata_params')
    query['sites'] = query['sites'].format(source=source)
    query['titles'] = '|'.join(titles)

    missing = {}
    try:
        response = post(wikidata_url, data=query)
    except ValueError:
        log.info('Bad Wikidata API response')
        return missing

    # Sitelink keys are the language code followed by "wiki".
    source_key = '{}wiki'.format(source)
    target_key = '{}wiki'.format(target)

    if 'entities' not in response:
        log.info('None of the titles have a Wikidata Item')
        return missing

    for item_id, entity in response['entities'].items():
        links = entity.get('sitelinks')
        if not links:
            continue
        # Keep only items present in the source wiki and absent from
        # the target wiki.
        if source_key not in links or target_key in links:
            continue
        normalized_title = links[source_key]['title'].replace(' ', '_')
        missing[normalized_title] = item_id

    if not missing:
        log.info('None of the source articles missing in the target')

    return missing
示例#2
0
def get_wikidata_sitelinks(source, target, titles):
    """
    Map source-wiki titles to Wikidata ids for every requested title
    whose item is linked from the source wiki but not the target wiki.

    Titles in the returned dict have spaces replaced with underscores.
    An empty dict is returned when the request raises ``ValueError`` or
    the response contains no ``entities``.
    """
    api_url = configuration.get_config_value('endpoints', 'wikidata')
    request_params = configuration.get_config_dict('wikidata_params')
    request_params['sites'] = request_params['sites'].format(source=source)
    request_params['titles'] = '|'.join(titles)

    result = {}
    try:
        payload = post(api_url, data=request_params)
    except ValueError:
        log.info('Bad Wikidata API response')
        return result

    in_source = '{}wiki'.format(source)
    in_target = '{}wiki'.format(target)

    if 'entities' not in payload:
        log.info('None of the titles have a Wikidata Item')
        return result

    for qid, entity in payload['entities'].items():
        entity_links = entity.get('sitelinks')
        # An entity qualifies when it has sitelinks, one of them is for
        # the source wiki, and none is for the target wiki.
        if entity_links and in_source in entity_links \
                and in_target not in entity_links:
            source_title = entity_links[in_source]['title']
            result[source_title.replace(' ', '_')] = qid

    if not result:
        log.info('None of the source articles missing in the target')

    return result
def build_wiki_search(source, seed, count, morelike):
    """
    Build the endpoint and parameters for a wiki search request.

    ``count`` is stored as ``srlimit`` and ``seed`` as ``srsearch``;
    when ``morelike`` is truthy the seed is prefixed with ``morelike:``.
    Returns an ``(endpoint, params)`` tuple; no request is sent here.
    """
    url_template = configuration.get_config_value('endpoints', 'wikipedia')
    endpoint = url_template.format(source=source)

    search_params = configuration.get_config_dict('wiki_search_params')
    search_params['srlimit'] = count
    search_params['srsearch'] = 'morelike:' + seed if morelike else seed
    return endpoint, search_params
def get_disambiguation_pages(source, titles):
    """
    Return the titles among ``titles`` whose pages carry a
    ``disambiguation`` page property.

    The result is a de-duplicated list of titles with spaces replaced by
    underscores. An empty list is returned when the request raises
    ``ValueError``.
    """
    url_template = configuration.get_config_value('endpoints', 'wikipedia')
    endpoint = url_template.format(source=source)
    query = configuration.get_config_dict('disambiguation_params')
    query['titles'] = '|'.join(titles)

    try:
        response = post(endpoint, data=query)
    except ValueError:
        log.info('Bad Disambiguation API response')
        return []

    found = set()
    for page in response.get('query', {}).get('pages', {}).values():
        # Pages without a "pageprops" entry are treated as regular pages.
        if 'disambiguation' in page.get('pageprops', {}):
            found.add(page['title'].replace(' ', '_'))
    return list(found)
示例#5
0
def get_disambiguation_pages(source, titles):
    """
    Filter ``titles`` down to those that are disambiguation pages.

    A page counts as a disambiguation page when its ``pageprops`` in the
    API response contain a ``disambiguation`` key. Returns a list of
    unique titles (spaces replaced with underscores), or an empty list
    when the request raises ``ValueError``.
    """
    endpoint = configuration.get_config_value(
        'endpoints', 'wikipedia').format(source=source)
    request_params = configuration.get_config_dict('disambiguation_params')
    request_params['titles'] = '|'.join(titles)

    try:
        payload = post(endpoint, data=request_params)
    except ValueError:
        log.info('Bad Disambiguation API response')
        return []

    page_map = payload.get('query', {}).get('pages', {})
    disambiguation_titles = {
        info['title'].replace(' ', '_')
        for info in page_map.values()
        if 'disambiguation' in info.get('pageprops', {})
    }
    return list(disambiguation_titles)
def get_category_members(source, category):
    """
    Fetch the direct members of ``category`` on the ``source`` wiki.

    Returns a dict with two keys, ``'pages'`` and ``'subcats'``, each
    holding a set of member titles of that type. Members of any other
    type are ignored.

    When the request raises ``ValueError`` both sets are empty. The
    return value therefore always has the same shape, so callers can
    index ``['pages']`` / ``['subcats']`` without checking for failure.
    """
    log.debug(category)
    endpoint = configuration.get_config_value(
        'endpoints', 'wikipedia').format(source=source)
    params = configuration.get_config_dict('category_search_params')
    params['cmtitle'] = category

    members = dict(pages=set(), subcats=set())

    try:
        response = get(endpoint, params=params)
    except ValueError:
        # Previously this path returned a bare list, which broke callers
        # that index the result by 'pages' / 'subcats'.
        log.info('Bad Category API response')
        return members

    results = response.get('query', {}).get('categorymembers', [])
    for member in results:
        member_type = member.get('type')
        if member_type == 'page':
            members['pages'].add(member.get('title'))
        elif member_type == 'subcat':
            members['subcats'].add(member.get('title'))
    return members
def get_items(source,
              titles=None,
              ids=None,
              raw_filter=default_filter,
              props=None):
    """
    Query Wikidata by ``titles`` or ``ids`` and return the extracted items.

    ``props``, when truthy, replaces the configured ``props`` parameter.
    If both ``titles`` and ``ids`` are supplied, both queries run but
    only the ``ids`` results are kept. Raw results rejected by
    ``raw_filter`` are skipped, as are results for which
    ``extract_from_raw`` returns ``None``. With neither ``titles`` nor
    ``ids`` the result is an empty list.
    """
    query = configuration.get_config_dict('wikidata_query_params')
    if props:
        query['props'] = props
    query['sites'] = query['sites'].format(source=source)

    raw_items = []
    if titles is not None:
        raw_items = chunk_query_for_parameter(query, 'titles', titles)
    if ids is not None:
        raw_items = chunk_query_for_parameter(query, 'ids', ids)

    extracted = []
    for raw in raw_items:
        if not raw_filter(raw):
            continue
        item = extract_from_raw(raw, query['sites'])
        if item is not None:
            extracted.append(item)
    return extracted