def wikidata_entity_request(
    qids: List[str],
    language_keys: Optional[List[str]] = lang_keys,
    props: Optional[List[str]] = [
        CLAIMS,
        DESCRIPTION[PLURAL],
        LABEL[PLURAL],
        SITELINKS,
    ],
    timeout: Optional[int] = TIMEOUT,
    sleep_time: Optional[int] = SLEEP_TIME,
    maxlag: Optional[int] = MAX_LAG,
) -> Dict:
    """Represents an wikidata entity request for a list of qids
    The API specifies that 50 items can be loaded at once without needing additional permissions:
    https://www.wikidata.org/w/api.php?action=help&modules=wbgetentities

    Args:
        qids: List of qids
        language_keys: All language keys which should be extracted. Defaults to languageconfig.csv
        props: Properties of the entity request. Defaults to [CLAIMS, DESCRIPTION[PLURAL], LABEL[PLURAL], SITELINKS]
        timeout: Timeout for the queries. Defaults to TIMEOUT
        sleep_time: Sleep time if errors occur. Defaults to SLEEP_TIME
        maxlag: Maxlag for the wikidata server see https://www.mediawiki.org/wiki/Manual:Maxlag_parameter. Defaults to MAX_LAG

    Returns:
        Raw wikidata response for the requested entities

    Examples:
        wikidata_entity_request(["Q12418", "Q45585"])
    """
    initial_timeout = timeout
    langkeyPlusWikiList = [key + "wiki" for key in language_keys]
    parameters = {
        "action": "wbgetentities",
        "ids": "|".join(qids),
        "format": JSON,
        "languages": "|".join(language_keys),
        "sitefilter": "|".join(langkeyPlusWikiList),
        "props": "|".join(props),
        "redirects": "no",
        # if the server needs more than maxlag seconds to process
        # the query an error response is returned
        "maxlag": maxlag,
    }

    url = WIKIDATA_API_URL
    return send_http_request(
        parameters,
        HTTP_HEADER,
        url,
        logger,
        initial_timeout=initial_timeout,
        items=qids,
        timeout=timeout,
        sleep_time=sleep_time,
        maxlag=maxlag,
    )
Exemplo n.º 2
0
def get_wikipedia_page_ids(
        items: List[Dict],
        indices: List[int],
        langkey: str,
        timeout: Optional[int] = TIMEOUT,
        sleep_time: Optional[int] = SLEEP_TIME,
        maxlag: Optional[int] = MAX_LAG,
) -> Dict:
    """Function to get the wikipedia page ids from their label referenced in the sitelinks

    https://en.wikipedia.org/w/api.php?action=help&modules=query
    sitelink de: Mona_Lisa is resolved to

    Args:
        items: List of items
        indices: A list of indices which contain a sitelink
        langkey: A specific language key e. g. 'en'
        timeout: Timeout on the request. Defaults to TIMEOUT.
        sleep_time: Waiting time if there are serverside problems. Defaults to SLEEP_TIME.
        maxlag: Maxlag for the wikidata server see https://www.mediawiki.org/wiki/Manual:Maxlag_parameter. Defaults to MAX_LAG

    Returns:
        A dictionary which maps the wikipedia page id (which is not a qid like in wikidata) to an index in the items dictionary

    Source:
        https://stackoverflow.com/questions/52787504/how-to-get-page-id-from-wikipedia-page-title
    """
    title_indice_dictionary = {}
    wikipedia_url = f"https://{langkey}.wikipedia.org/wiki/"
    for index in indices:
        title_indice_dictionary.update(
            {
                items[index][f"{WIKIPEDIA_LINK}_{langkey}"].replace(
                    wikipedia_url, ""
                ): index
            }
        )

    parameters = {
        "action": "query",
        "format": JSON,
        "prop": "info",
        "titles": "|".join(title_indice_dictionary.keys()),
        # if the server needs more than maxlag seconds to answer
        # the query an error response is returned
        "maxlag": maxlag,
    }

    url = f"https://{langkey}.wikipedia.org/w/api.php"
    response = send_http_request(
        parameters,
        HTTP_HEADER,
        url,
        logger,
        items=title_indice_dictionary.keys(),
        timeout=TIMEOUT,
        sleep_time=SLEEP_TIME,
        maxlag=MAX_LAG,
    )

    page_normalized_titles = {x: x for x in title_indice_dictionary.keys()}

    # map index of json array to page id of wikipedia
    item_page_id_index_dictionary = {}
    if "normalized" in response["query"]:
        for mapping in response["query"]["normalized"]:
            page_normalized_titles[mapping["to"]] = mapping["from"]

    for page_id, page_info in response["query"]["pages"].items():
        normalized_title = page_info["title"]
        page_title = page_normalized_titles[normalized_title]
        index = title_indice_dictionary[page_title]
        item_page_id_index_dictionary[page_id] = index

    return item_page_id_index_dictionary
Exemplo n.º 3
0
def get_wikipedia_extracts(
        items: List[Dict],
        page_id_index_dictionary: Dict,
        langkey: str,
        timeout: Optional[int] = TIMEOUT,
        sleep_time: Optional[int] = SLEEP_TIME,
        maxlag: Optional[int] = MAX_LAG,
):
    """Get the wikipedia extracts (in our data model they're called abstracts)

    https://en.wikipedia.org/w/api.php?action=help&modules=query

    Args:
        items: List of entities
        page_id_index_dictionary: Dictionary to resolve the page ids from the indices in the items list
        langkey: A specific language key e. g. 'en'
        timeout: Timeout on the request. Defaults to TIMEOUT.
        sleep_time: Waiting time if there are serverside problems. Defaults to SLEEP_TIME.
        maxlag: Maxlag for the wikidata server see https://www.mediawiki.org/wiki/Manual:Maxlag_parameter. Defaults to MAX_LAG

    Returns:
        A dictionary with index and abstract which is added to the entity of the index later

    Source:
            https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&exintro&explaintext&pageids=70889|1115370
    """
    parameters = {
        "action": "query",
        "format": JSON,
        "prop": "extracts",
        "exintro": True,
        "explaintext": True,
        "pageids": "|".join(page_id_index_dictionary.keys()),
        # if the server needs more than maxlag seconds to answer
        # the query an error response is returned
        "maxlag": maxlag,
    }

    # Send HTTP-Request
    url = f"https://{langkey}.wikipedia.org/w/api.php"
    response = send_http_request(
        parameters,
        HTTP_HEADER,
        url,
        logger,
        items=page_id_index_dictionary.keys(),
        abstracts=True,
        timeout=TIMEOUT,
        sleep_time=SLEEP_TIME,
        maxlag=MAX_LAG,
    )

    index_extract_dictionary = {}
    for page_id, index in page_id_index_dictionary.items():
        if int(page_id) < 0:
            print(
                "For the wikidata item {0} there was no pageid found on the {1}.wikipedia site. Therefore the extract is set to an empty string now".format(
                    items[index]["id"], langkey
                )
            )
            # Return empty extract for those cases
            index_extract_dictionary[index] = ""
            continue
        index_extract_dictionary[index] = response["query"]["pages"][page_id]["extract"]
    return index_extract_dictionary