def get_wikidata_sitelinks(source, target, titles):
    """
    Map source-wiki titles to Wikidata ids for articles the target wiki lacks.

    The titles are joined with '|' and posted to the configured Wikidata
    endpoint. An entity is kept only when its sitelinks contain an entry
    for '<source>wiki' and none for '<target>wiki'; the key stored for it
    is the source sitelink title with spaces replaced by underscores.

    Returns an empty dict when the request raises ValueError, when the
    response has no 'entities' key, or when no entity qualifies.
    """
    api_url = configuration.get_config_value('endpoints', 'wikidata')
    query = configuration.get_config_dict('wikidata_params')
    query['sites'] = query['sites'].format(source=source)
    query['titles'] = '|'.join(titles)

    missing = {}
    try:
        response = post(api_url, data=query)
    except ValueError:
        log.info('Bad Wikidata API response')
        return missing

    source_key = '{}wiki'.format(source)
    target_key = '{}wiki'.format(target)

    if 'entities' not in response:
        log.info('None of the titles have a Wikidata Item')
        return missing

    for item_id, entity in response['entities'].items():
        links = entity.get('sitelinks', None)
        if not links:
            continue
        # Keep only items present in the source wiki and absent in the target.
        if source_key not in links or target_key in links:
            continue
        missing[links[source_key]['title'].replace(' ', '_')] = item_id

    if not missing:
        log.info('None of the source articles missing in the target')
    return missing
def build_wiki_search(source, seed, count, morelike):
    """
    Build the endpoint and parameter dict for a Wikipedia search request.

    The endpoint comes from the 'wikipedia' entry of the 'endpoints'
    configuration, formatted with ``source``. ``count`` is stored as
    'srlimit'; ``seed`` is stored as 'srsearch', prefixed with 'morelike:'
    when ``morelike`` is truthy.

    Returns a ``(endpoint, params)`` tuple; no request is made here.
    """
    url_template = configuration.get_config_value('endpoints', 'wikipedia')
    endpoint = url_template.format(source=source)

    search_params = configuration.get_config_dict('wiki_search_params')
    search_params['srlimit'] = count
    search_params['srsearch'] = 'morelike:' + seed if morelike else seed
    return endpoint, search_params
def get_disambiguation_pages(source, titles):
    """
    Return the titles, among those given, that are disambiguation pages.

    Posts the '|'-joined titles to the configured Wikipedia endpoint for
    ``source`` and collects every returned page whose 'pageprops' contains
    'disambiguation'. Titles are returned with spaces replaced by
    underscores, de-duplicated, in no particular order.

    Returns an empty list when the request raises ValueError.
    """
    # NOTE(review): a later definition with the same name rebinds this one
    # at import time -- confirm whether this copy can be removed.
    api_url = configuration.get_config_value('endpoints', 'wikipedia').format(source=source)
    query = configuration.get_config_dict('disambiguation_params')
    query['titles'] = '|'.join(titles)

    try:
        response = post(api_url, data=query)
    except ValueError:
        log.info('Bad Disambiguation API response')
        return []

    page_records = response.get('query', {}).get('pages', {}).values()
    flagged = {
        record['title'].replace(' ', '_')
        for record in page_records
        if 'disambiguation' in record.get('pageprops', {})
    }
    return list(flagged)
def get_disambiguation_pages(source, titles):
    """
    Filter ``titles`` down to those that are disambiguation pages.

    The titles are sent, joined by '|', to the Wikipedia endpoint
    configured for ``source``. A page counts as a disambiguation page when
    the key 'disambiguation' appears in its 'pageprops'.

    Returns a list of unique titles with spaces converted to underscores
    (order unspecified), or an empty list if the request raises ValueError.
    """
    url_template = configuration.get_config_value('endpoints', 'wikipedia')
    endpoint = url_template.format(source=source)
    request_params = configuration.get_config_dict('disambiguation_params')
    request_params['titles'] = '|'.join(titles)

    try:
        payload = post(endpoint, data=request_params)
    except ValueError:
        log.info('Bad Disambiguation API response')
        return []

    found = set()
    for page in payload.get('query', {}).get('pages', {}).values():
        if 'disambiguation' not in page.get('pageprops', {}):
            continue
        found.add(page['title'].replace(' ', '_'))
    return list(found)
def get_category_members(source, category):
    """
    Fetch the members of a category, split into pages and subcategories.

    Args:
        source: value substituted into the configured Wikipedia endpoint
            template.
        category: category title sent as the 'cmtitle' request parameter.

    Returns:
        A dict with the keys 'pages' and 'subcats', each holding a set of
        member titles taken from the response's 'categorymembers' list.
        Both sets are empty when the request raises ValueError, so callers
        can always index the result the same way.
    """
    log.debug(category)
    endpoint = configuration.get_config_value(
        'endpoints', 'wikipedia').format(source=source)
    params = configuration.get_config_dict('category_search_params')
    params['cmtitle'] = category

    members = dict(pages=set(), subcats=set())
    try:
        response = get(endpoint, params=params)
    except ValueError:
        # Return the same shape as the success path (this used to be a bare
        # list, which broke callers doing members['pages']), and log the
        # failure like the other API helpers instead of swallowing it.
        log.info('Bad Category API response')
        return members

    results = response.get('query', {}).get('categorymembers', [])
    for member in results:
        member_type = member.get('type', None)
        # Members of any other type are ignored.
        if member_type == 'page':
            members['pages'].add(member.get('title'))
        elif member_type == 'subcat':
            members['subcats'].add(member.get('title'))
    return members
def get_items(source, titles=None, ids=None, raw_filter=default_filter, props=None):
    """
    Query items by titles and/or ids and return the extracted results.

    Parameters come from the 'wikidata_query_params' configuration; a
    truthy ``props`` overrides its 'props' entry and ``source`` is
    formatted into its 'sites' entry. Raw items are fetched through
    ``chunk_query_for_parameter``, kept only if ``raw_filter`` accepts
    them, converted with ``extract_from_raw``, and dropped when that
    conversion yields None.

    If both ``titles`` and ``ids`` are given, both queries are issued but
    only the result of the ``ids`` query is used. With neither, an empty
    list is returned.
    """
    query = configuration.get_config_dict('wikidata_query_params')
    if props:
        query['props'] = props
    query['sites'] = query['sites'].format(source=source)

    raw_items = []
    if titles is not None:
        raw_items = chunk_query_for_parameter(query, 'titles', titles)
    if ids is not None:
        # Deliberately not an elif: this replaces any titles result above.
        raw_items = chunk_query_for_parameter(query, 'ids', ids)

    extracted = []
    for raw in raw_items:
        if not raw_filter(raw):
            continue
        item = extract_from_raw(raw, query['sites'])
        if item is not None:
            extracted.append(item)
    return extracted