Example No. 1
0
def try_map_response_to_subject(
    response: Dict,
    type_name: str,
    language_keys: Optional[List[str]] = lang_keys,
) -> Optional[Dict]:
    """Maps the default attributes which every subject has:
    qid, image, label, description, classes, wikipediaLink (including language specific attributes)

    Args:
        response: The wikidata entity which should be mapped to an openArtBrowser entity
        type_name: Type name of the entity
        language_keys: All language keys which should be extracted. Defaults to languageconfig.csv

    Returns:
        A dict of an openArtBrowser entity, or None if the entity has no qid
    """
    try:
        qid = response[ID]
    except Exception as error:
        # Without a qid the entity cannot be referenced at all, so skip it entirely
        logger.error("Error on qid, skipping item. Error: {0}".format(error))
        return None

    # How to get image url
    # https://stackoverflow.com/questions/34393884/how-to-get-image-url-property-from-wikidata-item-by-api
    try:
        image = get_image_url_by_name(
            response[CLAIMS][PROPERTY_NAME_TO_PROPERTY_ID[IMAGE]][0][MAINSNAK]
            [DATAVALUE][VALUE])
    except Exception:
        # Entity has no image claim (or an unexpected claim shape); an empty
        # string is the documented fallback. Narrowed from a bare `except:`
        # so KeyboardInterrupt/SystemExit are no longer swallowed here.
        image = ""

    label = map_wd_attribute.try_get_label_or_description(
        response, LABEL[PLURAL], EN, type_name)
    description = map_wd_attribute.try_get_label_or_description(
        response, DESCRIPTION[PLURAL], EN, type_name)
    classes = map_wd_attribute.try_get_qid_reference_list(
        response, PROPERTY_NAME_TO_PROPERTY_ID[CLASS[SINGULAR]], type_name)

    subject_dict = {
        ID: qid,
        CLASS[PLURAL]: classes,
        LABEL[SINGULAR]: label,
        DESCRIPTION[SINGULAR]: description,
        IMAGE: image,
    }

    # Add label, description and wikipedia link per configured language,
    # e.g. "label_de", "description_de", "wikipediaLink_de"
    for langkey in language_keys:
        label_lang = map_wd_attribute.try_get_label_or_description(
            response, LABEL[PLURAL], langkey, type_name)
        description_lang = map_wd_attribute.try_get_label_or_description(
            response, DESCRIPTION[PLURAL], langkey, type_name)
        wikipedia_link_lang = map_wd_attribute.try_get_wikipedia_link(
            response, langkey, type_name)
        subject_dict.update({
            f"{LABEL[SINGULAR]}_{langkey}": label_lang,
            f"{DESCRIPTION[SINGULAR]}_{langkey}": description_lang,
            f"{WIKIPEDIA_LINK}_{langkey}": wikipedia_link_lang,
        })

    return subject_dict
def get_exhibition_entities(
    qids: Set[str],
    language_keys: Optional[List[str]] = lang_keys,
    type_name: str = EXHIBITION,
) -> Dict[str, Dict]:
    """Function to get the exhibition entities from wikidata

    Args:
        qids: Distinct qid set to get the entities from
        language_keys: Language keys to extract label and description from. Defaults to languageconfig.csv
        type_name: OAB type name. Defaults to EXHIBITION.

    Returns:
        A dict with the qids as key and the JSON object as value
    """
    print(datetime.datetime.now(), "Starting with exhibition entities")
    print(f"Total exhibition entities to extract: {len(qids)}")
    item_count = 0
    extract_dicts = {}
    chunk_size = 50  # The chunksize 50 is allowed by the wikidata api, bigger numbers need special permissions
    id_chunks = chunks(list(qids), chunk_size)
    for chunk in id_chunks:
        query_result = wikidata_entity_request(chunk)

        # Guard against failed API responses, consistent with the other
        # extraction functions; previously a failed chunk raised a KeyError
        if ENTITIES not in query_result:
            logger.error("Skipping chunk")
            continue

        for result in query_result[ENTITIES].values():
            try:
                qid = result[ID]
            except Exception as error:
                logger.error(
                    "Error on qid, skipping item. Error: {0}".format(error))
                continue
            label = map_wd_attribute.try_get_label_or_description(
                result, LABEL[PLURAL], EN, type_name)
            description = map_wd_attribute.try_get_label_or_description(
                result, DESCRIPTION[PLURAL], EN, type_name)
            start_time = map_wd_attribute.try_get_year_from_property_timestamp(
                result, PROPERTY_NAME_TO_PROPERTY_ID[START_TIME], type_name)
            end_time = map_wd_attribute.try_get_year_from_property_timestamp(
                result, PROPERTY_NAME_TO_PROPERTY_ID[END_TIME], type_name)

            extract_dicts.update({
                qid: {
                    LABEL[SINGULAR]: label,
                    DESCRIPTION[SINGULAR]: description,
                    START_TIME: start_time,
                    END_TIME: end_time,
                    TYPE: EXHIBITION,
                }
            })

            # Language specific labels and descriptions, e.g. "label_de"
            for langkey in language_keys:
                label_lang = map_wd_attribute.try_get_label_or_description(
                    result, LABEL[PLURAL], langkey, type_name)
                description_lang = map_wd_attribute.try_get_label_or_description(
                    result, DESCRIPTION[PLURAL], langkey, type_name)
                extract_dicts[qid][f"{LABEL[SINGULAR]}_{langkey}"] = label_lang
                extract_dicts[qid][
                    f"{DESCRIPTION[SINGULAR]}_{langkey}"] = description_lang

        item_count += len(chunk)
        print(
            f"Status of exhibition entities: {item_count}/{len(qids)}",
            end="\r",
            flush=True,
        )

    print(datetime.datetime.now(), "Finished with exhibition entities")
    return extract_dicts
def get_entity_labels(
    type_name: str,
    qids: List[str],
    language_keys: Optional[List[str]] = lang_keys,
) -> List[Dict]:
    """Function to get the entity labels from wikidata

    Args:
        type_name: oab type e. g. movement
        qids: List of qids to extract the labels from
        language_keys: All language keys which should be extracted. Defaults to languageconfig.csv

    Returns:
        List of dicts containing the qid and the labels for each language
    """
    print(datetime.datetime.now(),
          f"Starting with {type_name} {LABEL[PLURAL]}")
    print(f"Total {type_name} {LABEL[PLURAL]} to extract: {len(qids)}")
    # The chunksize 50 is allowed by the wikidata api, bigger numbers need special permissions
    chunk_size = 50
    processed_count = 0
    extract_dicts = []
    for id_chunk in chunks(list(qids), chunk_size):
        # country entities take longer so timeout is increased
        response = wikidata_entity_request(
            id_chunk, props=[LABEL[PLURAL]], timeout=10)

        if ENTITIES not in response:
            logger.error("Skipping chunk")
            continue

        for entity in response[ENTITIES].values():
            try:
                qid = entity[ID]
            except Exception as error:
                logger.error(
                    "Error on qid, skipping item. Error: {0}".format(error))
                continue

            # English label first, then one label entry per configured language
            entry = {
                ID: qid,
                LABEL[SINGULAR]: map_wd_attribute.try_get_label_or_description(
                    entity, LABEL[PLURAL], EN, type_name),
            }
            entry.update({
                f"{LABEL[SINGULAR]}_{langkey}":
                map_wd_attribute.try_get_label_or_description(
                    entity, LABEL[PLURAL], langkey, type_name)
                for langkey in language_keys
            })
            extract_dicts.append(entry)

        processed_count += len(id_chunk)
        print(
            f"Status of {type_name} {LABEL[PLURAL]}: {processed_count}/{len(qids)}",
            end="\r",
            flush=True,
        )

    print(datetime.datetime.now(),
          f"Finished with {type_name} {LABEL[PLURAL]}")
    return extract_dicts
def get_classes(
    type_name: str,
    qids: List[str],
    already_extracted_superclass_ids: Optional[Set[str]] = None,
    language_keys: Optional[List[str]] = lang_keys,
) -> List[Dict]:
    """Function to extract the classes of the extracted wikidata entities (meaning the 'instance of' attribute wikidata entity qids).
    Their subclasses are also extracted recursively (also called transitive closure)

    Args:
        type_name: oab type e. g. movement
        qids: List of qids to extract the labels from
        language_keys: All language keys which should be extracted. Defaults to languageconfig.csv
        already_extracted_superclass_ids: A list of already extracted superclass ids for the recursive calls,
        this is also the anchor to stop recursion

    Returns:
        Returns a list of dicts with the classes from the oab entities and their subclasses
    """
    # A mutable default (`= set()`) would be shared across calls, so a second
    # top-level call would wrongly skip classes seen in the first one.
    # Use the None-sentinel idiom to create a fresh set per call instead.
    if already_extracted_superclass_ids is None:
        already_extracted_superclass_ids = set()
    print(datetime.datetime.now(), f"Starting with {type_name}")
    if type_name == CLASS[PLURAL]:
        print(
            f"Total {type_name} to extract (only 'instance_of' of the provided qids): {len(qids)}"
        )
    else:
        print(
            f"Total {type_name} to extract (only 'subclass_of' of the provided qids): {len(qids)}"
        )
    item_count = 0
    extract_dicts = []
    chunk_size = 50  # The chunksize 50 is allowed by the wikidata api, bigger numbers need special permissions
    classes_id_chunks = chunks(list(qids), chunk_size)
    for chunk in classes_id_chunks:
        query_result = wikidata_entity_request(chunk)

        if ENTITIES not in query_result:
            logger.error("Skipping chunk")
            continue

        for result in query_result[ENTITIES].values():
            try:
                qid = result[ID]
            except Exception as error:
                logger.error(
                    "Error on qid, skipping item. Error: {0}".format(error))
                continue
            label = map_wd_attribute.try_get_label_or_description(
                result, LABEL[PLURAL], EN, type_name)
            description = map_wd_attribute.try_get_label_or_description(
                result, DESCRIPTION[PLURAL], EN, type_name)
            subclass_of = map_wd_attribute.try_get_qid_reference_list(
                result, PROPERTY_NAME_TO_PROPERTY_ID[SUBCLASS_OF], type_name)
            class_dict = {
                ID: qid,
                LABEL[SINGULAR]: label,
                DESCRIPTION[SINGULAR]: description,
                SUBCLASS_OF: subclass_of,
            }

            # Language specific labels and descriptions, e.g. "label_de"
            for langkey in language_keys:
                label_lang = map_wd_attribute.try_get_label_or_description(
                    result, LABEL[PLURAL], langkey, type_name)
                description_lang = map_wd_attribute.try_get_label_or_description(
                    result, DESCRIPTION[PLURAL], langkey, type_name)
                class_dict.update({
                    f"{LABEL[SINGULAR]}_{langkey}":
                    label_lang,
                    f"{DESCRIPTION[SINGULAR]}_{langkey}":
                    description_lang,
                })
            extract_dicts.append(class_dict)

        item_count += len(chunk)
        print(f"Status of {type_name}: {item_count}/{len(qids)}",
              end="\r",
              flush=True)

    # Recursively follow SUBCLASS_OF references until no new superclasses
    # remain; already_extracted_superclass_ids is the recursion anchor
    return load_entities_by_attribute_with_transitive_closure(
        extract_dicts,
        SUBCLASS_OF,
        CLASS[PLURAL],
        already_extracted_superclass_ids,
        get_classes,
        [],
    )
def extract_artworks(
    type_name: str,
    wikidata_id: str,
    already_crawled_wikidata_items: Set,
    dev_mode: bool,
    dev_chunk_limit: int,
    language_keys: Optional[List[str]] = lang_keys,
) -> List[Dict]:
    """Extracts artworks metadata from Wikidata and stores them in a dictionary.

    Args:
        type_name: Type name of an artwork e. g. 'drawings'. Important for console output
        wikidata_id: Wikidata Id of a class; all instances of this class and all subclasses with image will be loaded. See artworks_ids_query.sparql
        already_crawled_wikidata_items: Set of all already crawled artwork items. Because the types have common items it is necessary to avoid loading items multiple times
        language_keys: All language keys which should be extracted. Defaults to languageconfig.csv
        dev_mode: To reduce the number of loaded chunks set this to true
        dev_chunk_limit: Limit of chunks per category
    Returns:
        A list with all artwork entity dicts (or JSON-objects) which are transformed for the OAB

    Examples:
        extract_artworks('drawings', 'wd:Q93184', '('en', 'de'))
        extract_artworks('sculptures', 'wd:Q860861', '('en', 'de'))
        extract_artworks('paintings', 'wd:Q3305213', '('en', 'de'))
    """
    print(datetime.datetime.now(), "Starting with", type_name)

    extract_dicts = []
    chunk_count = 0
    item_count = 0
    artwork_ids = query_artwork_qids(type_name, wikidata_id)

    # Don't load items again, if they were loaded in another artwork category.
    # Build a new list instead of calling remove() while iterating, which
    # skips the element following each removal.
    artwork_ids = [
        artwork_id for artwork_id in artwork_ids
        if artwork_id not in already_crawled_wikidata_items
    ]

    print(
        f"{len(artwork_ids)} {type_name} entries are not loaded yet, starting now. Already crawled item count is {len(already_crawled_wikidata_items)}"
    )
    chunk_size = 50  # The chunksize 50 is allowed by the wikidata api, bigger numbers need special permissions
    artwork_id_chunks = chunks(artwork_ids, chunk_size)
    for chunk in artwork_id_chunks:
        if dev_mode and chunk_count == dev_chunk_limit:
            logger.info(
                f"DEV_CHUNK_LIMIT of {type_name} reached. End extraction for {type_name}"
            )
            break

        query_result = wikidata_entity_request(chunk)
        if ENTITIES not in query_result:
            logger.error("Skipping chunk")
            continue

        for result in query_result[ENTITIES].values():
            # Pre-assign so the except-branch can log them even when the
            # failure happens before the assignments (avoids a NameError
            # inside the handler masking the real error)
            qid = ""
            image = ""
            try:
                qid = result[ID]
                image = map_wd_attribute.get_image_url_by_name(
                    result[CLAIMS][PROPERTY_NAME_TO_PROPERTY_ID[IMAGE]][0]
                    [MAINSNAK][DATAVALUE][VALUE])
            except Exception as error:
                logger.error(
                    "Error on qid or image, skipping item. Qid: {0}, Image: {1}, Error: {2}"
                    .format(qid, image, error))
                continue

            label = map_wd_attribute.try_get_label_or_description(
                result, LABEL[PLURAL], EN, type_name)
            description = map_wd_attribute.try_get_label_or_description(
                result, DESCRIPTION[PLURAL], EN, type_name)

            # All qid-reference attributes are fetched with the same helper
            (
                classes,
                artists,
                locations,
                genres,
                movements,
                materials,
                motifs,
                main_subjects,
                exhibition_history,
            ) = map_wd_attribute.get_attribute_values_with_try_get_func(
                result,
                [
                    CLASS[SINGULAR],
                    ARTIST[SINGULAR],
                    LOCATION[SINGULAR],
                    GENRE[SINGULAR],
                    MOVEMENT[SINGULAR],
                    MATERIAL[SINGULAR],
                    MOTIF[SINGULAR],
                    MAIN_SUBJECT[SINGULAR],
                    EXHIBITION_HISTORY,
                ],
                type_name,
                map_wd_attribute.try_get_qid_reference_list,
            )

            iconclasses = map_wd_attribute.try_get_value_list(
                result, PROPERTY_NAME_TO_PROPERTY_ID[ICONCLASS[SINGULAR]],
                type_name)
            inception = map_wd_attribute.try_get_year_from_property_timestamp(
                result, PROPERTY_NAME_TO_PROPERTY_ID[INCEPTION], type_name)
            country = map_wd_attribute.try_get_first_qid(
                result, PROPERTY_NAME_TO_PROPERTY_ID[COUNTRY], type_name)

            # Resolve dimensions
            # The units are qids which have to be resolved later
            (
                height,
                width,
                length,
                diameter,
            ) = map_wd_attribute.get_attribute_values_with_try_get_func(
                result,
                [HEIGHT, WIDTH, LENGTH, DIAMETER],
                type_name,
                map_wd_attribute.try_get_dimension_value,
            )
            (
                height_unit,
                width_unit,
                length_unit,
                diameter_unit,
            ) = map_wd_attribute.get_attribute_values_with_try_get_func(
                result,
                [HEIGHT, WIDTH, LENGTH, DIAMETER],
                type_name,
                map_wd_attribute.try_get_dimension_unit,
            )

            significant_events = map_wd_attribute.try_get_significant_events(
                result)

            artwork_dictionary = {
                ID: qid,
                CLASS[PLURAL]: classes,
                LABEL[SINGULAR]: label,
                DESCRIPTION[SINGULAR]: description,
                IMAGE: image,
                ARTIST[PLURAL]: artists,
                LOCATION[PLURAL]: locations,
                GENRE[PLURAL]: genres,
                MOVEMENT[PLURAL]: movements,
                INCEPTION: inception,
                MATERIAL[PLURAL]: materials,
                MOTIF[PLURAL]: motifs,
                COUNTRY: country,
                HEIGHT: height,
                HEIGHT_UNIT: height_unit,
                WIDTH: width,
                WIDTH_UNIT: width_unit,
                LENGTH: length,
                LENGTH_UNIT: length_unit,
                DIAMETER: diameter,
                DIAMETER_UNIT: diameter_unit,
                ICONCLASS[PLURAL]: iconclasses,
                MAIN_SUBJECT[PLURAL]: main_subjects,
                EXHIBITION_HISTORY: exhibition_history,
                SIGNIFICANT_EVENT: significant_events,
                TYPE: ARTWORK[SINGULAR],
            }

            # Apply blocklist to artwork dictionary
            for t in [
                    CLASS[PLURAL], ARTIST[PLURAL], LOCATION[PLURAL],
                    GENRE[PLURAL], MOVEMENT[PLURAL], MATERIAL[PLURAL],
                    MOTIF[PLURAL], ICONCLASS[PLURAL], MAIN_SUBJECT[PLURAL],
                    EXHIBITION_HISTORY
            ]:
                try:
                    artwork_dictionary[t] = list(
                        set(artwork_dictionary[t]) - set(BLOCKLIST))
                except Exception as e:
                    logger.exception(e)
                    continue

            # Language specific label, description and wikipedia link,
            # e.g. "label_de", "description_de", "wikipediaLink_de"
            for langkey in language_keys:
                label_lang = map_wd_attribute.try_get_label_or_description(
                    result, LABEL[PLURAL], langkey, type_name)
                description_lang = map_wd_attribute.try_get_label_or_description(
                    result, DESCRIPTION[PLURAL], langkey, type_name)
                wikipedia_link_lang = map_wd_attribute.try_get_wikipedia_link(
                    result, langkey, type_name)
                artwork_dictionary.update({
                    f"{LABEL[SINGULAR]}_{langkey}":
                    label_lang,
                    f"{DESCRIPTION[SINGULAR]}_{langkey}":
                    description_lang,
                    f"{WIKIPEDIA_LINK}_{langkey}":
                    wikipedia_link_lang,
                })
            extract_dicts.append(artwork_dictionary)
            already_crawled_wikidata_items.add(qid)

        item_count += len(chunk)
        print(
            f"Status of {type_name}: {item_count}/{len(artwork_ids)}",
            end="\r",
            flush=True,
        )

        chunk_count += 1

    print(datetime.datetime.now(), "Finished with", type_name)
    return extract_dicts