def get_unit_symbols(qids: List[str]) -> List[Dict]:
    """Function to get the unit symbols from the unit entities

    Args:
        qids: List of qids

    Returns:
        List of dicts containing the unit id and their unit symbol in english language
    """
    print(datetime.datetime.now(), "Starting with unit symbols")
    print(f"Total unit symbols to extract: {len(qids)}")
    item_count = 0
    extract_dicts = []
    chunk_size = 50  # The chunksize 50 is allowed by the wikidata api, bigger numbers need special permissions
    id_chunks = chunks(list(qids), chunk_size)
    for chunk in id_chunks:
        query_result = wikidata_entity_request(chunk, props=[CLAIMS], timeout=10)

        if ENTITIES not in query_result:
            logger.error("Skipping chunk")
            continue

        for result in query_result[ENTITIES].values():
            try:
                qid = result[ID]
            except Exception as error:
                logger.error("Error on qid, skipping item. Error: {0}".format(error))
                continue

            unit_symbol = map_wd_attribute.try_get_unit_symbol(
                result, PROPERTY_NAME_TO_PROPERTY_ID[UNIT_SYMBOL], UNIT_SYMBOL
            )

            subject_dict = {ID: qid, UNIT_SYMBOL: unit_symbol}
            extract_dicts.append(subject_dict)

        item_count += len(chunk)
        print(
            f"Status of unit symbols: {item_count}/{len(qids)}", end="\r", flush=True
        )

    print(datetime.datetime.now(), "Finished with unit symbols")
    return extract_dicts
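# Illustrative usage sketch for get_unit_symbols (not part of the pipeline): the qid is
# a hypothetical example (Q11573 is Wikidata's item for the metre) and the exact symbol
# depends on what try_get_unit_symbol resolves. The result is a list of dicts keyed by
# the ID and UNIT_SYMBOL constants:
#
#   unit_symbols = get_unit_symbols(["Q11573"])
#   # e.g. [{ID: "Q11573", UNIT_SYMBOL: "m"}]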
def add_wikipedia_extracts(
    language_keys: Optional[List[str]] = lang_keys,
) -> None:
    """Add the wikipedia extracts to the already existing files

    Args:
        language_keys: Language keys to extract wikipedia abstracts for. Defaults to languageconfig.csv
    """
    for filename in [
        ARTWORK[PLURAL],
        MOTIF[PLURAL],
        GENRE[PLURAL],
        MATERIAL[PLURAL],
        MOVEMENT[PLURAL],
        ARTIST[PLURAL],
        LOCATION[PLURAL],
        CLASS[PLURAL],
    ]:
        print(
            datetime.datetime.now(),
            "Starting extracting wikipedia extracts with",
            filename,
        )
        try:
            with open(
                (create_new_path(filename)).with_suffix(f".{JSON}"), encoding="utf-8"
            ) as file:
                if RECOVER_MODE and check_state(
                    ETL_STATES.GET_WIKIPEDIA_EXTRACTS.EXTRACT_ABSTRACTS + filename
                ):
                    continue
                items = json.load(file)
                for key in language_keys:
                    item_indices_with_wiki_link_for_lang = [
                        items.index(item)
                        for item in items
                        if item[f"{WIKIPEDIA_LINK}_{key}"] != ""
                    ]
                    print(
                        f"There are {len(item_indices_with_wiki_link_for_lang)} {key}.wikipedia links within the {len(items)} {filename} items"
                    )

                    # Retry the operation until it is done
                    done = False
                    # ToDo: The limit for extracts seems to be 20, there is an excontinue parameter which
                    # could be used to increase the performance and load more at once (50 is allowed by the API) if needed
                    # The request method has to be adjusted for this
                    # Further information https://stackoverflow.com/questions/9846795/prop-extracts-not-returning-all-extracts-in-the-wikimedia-api
                    chunk_size = 20
                    while not done:
                        try:
                            item_indices_chunks = chunks(
                                item_indices_with_wiki_link_for_lang, chunk_size
                            )
                            extracted_count = 0
                            # Fill JSON objects without a wikilink to an abstract with empty key-value pairs (could be removed if the frontend is adjusted)
                            for j in range(len(items)):
                                if j not in item_indices_with_wiki_link_for_lang:
                                    items[j][f"{ABSTRACT}_{key}"] = ""

                            for chunk in item_indices_chunks:
                                # Get page ids from URL https://en.wikipedia.org/w/api.php?action=query&titles=Jean_Wauquelin_presenting_his_'Chroniques_de_Hainaut'_to_Philip_the_Good
                                page_id_indices_dictionary = get_wikipedia_page_ids(
                                    items, chunk, key
                                )
                                # Get extracts from page ids https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&exintro&explaintext&pageids=70889|1115370
                                raw_response = get_wikipedia_extracts(
                                    items, page_id_indices_dictionary, key
                                )
                                # Add the extracted abstracts to the JSON objects
                                for i in chunk:
                                    items[i][f"{ABSTRACT}_{key}"] = raw_response[i]

                                extracted_count += len(chunk)
                                print(
                                    f"Extracts for {filename} and language {key} status: {extracted_count}/{len(item_indices_with_wiki_link_for_lang)}",
                                    end="\r",
                                    flush=True,
                                )
                                # If a chunk is finished and the chunk size is < 20 (e.g. the previous chunk failed but the current one succeeded): increase the chunk size
                                chunk_size = chunk_size + 5 if chunk_size < 20 else chunk_size

                            # Set done to true after all items have been processed
                            done = True
                        except Exception as error:
                            logger.error(
                                f"Fetching wikipedia extracts for {filename}, lang: {key} and chunk size: {chunk_size} failed!"
                            )
                            logger.error(error)
                            # Lower the chunk size and try again in the while loop
                            chunk_size -= 5
                            if chunk_size > 0:
                                logger.info(
                                    f"Trying the wikipedia extracts again with chunk size: {chunk_size}"
                                )
                                continue
                            else:
                                logger.exception(error)
                                raise error

            # Overwrite the file
            with open(
                (create_new_path(filename)).with_suffix(f".{JSON}"),
                "w",
                newline="",
                encoding="utf-8",
            ) as file:
                json.dump(items, file, ensure_ascii=False)
            write_state(ETL_STATES.GET_WIKIPEDIA_EXTRACTS.EXTRACT_ABSTRACTS + filename)
        except Exception as error:
            print(
                f"Error when opening following file: {filename}. Error: {error}. Skipping file now."
            )
            continue
        print(
            datetime.datetime.now(),
            "Finished extracting wikipedia extracts with",
            filename,
        )
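# Hedged usage sketch: add_wikipedia_extracts is presumably invoked by the ETL entry
# point after the per-type JSON files have been written; it reads and overwrites those
# files in place. A manual run for a subset of languages could look like this (the
# language list is an illustrative assumption; by default lang_keys from
# languageconfig.csv is used):
#
#   add_wikipedia_extracts(language_keys=["en", "de"])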
def get_exhibition_entities(
    qids: Set[str],
    language_keys: Optional[List[str]] = lang_keys,
    type_name: str = EXHIBITION,
) -> Dict[str, Dict]:
    """Function to get the exhibition entities from wikidata

    Args:
        qids: Distinct qid set to get the entities from
        language_keys: Language keys to extract label and description from. Defaults to languageconfig.csv
        type_name: OAB type name. Defaults to EXHIBITION.

    Returns:
        A dict with the qids as key and the JSON object as value
    """
    print(datetime.datetime.now(), "Starting with exhibition entities")
    print(f"Total exhibition entities to extract: {len(qids)}")
    item_count = 0
    extract_dicts = {}
    chunk_size = 50  # The chunksize 50 is allowed by the wikidata api, bigger numbers need special permissions
    id_chunks = chunks(list(qids), chunk_size)
    for chunk in id_chunks:
        query_result = wikidata_entity_request(chunk)
        for result in query_result[ENTITIES].values():
            try:
                qid = result[ID]
            except Exception as error:
                logger.error("Error on qid, skipping item. Error: {0}".format(error))
                continue

            label = map_wd_attribute.try_get_label_or_description(
                result, LABEL[PLURAL], EN, type_name
            )
            description = map_wd_attribute.try_get_label_or_description(
                result, DESCRIPTION[PLURAL], EN, type_name
            )
            start_time = map_wd_attribute.try_get_year_from_property_timestamp(
                result, PROPERTY_NAME_TO_PROPERTY_ID[START_TIME], type_name
            )
            end_time = map_wd_attribute.try_get_year_from_property_timestamp(
                result, PROPERTY_NAME_TO_PROPERTY_ID[END_TIME], type_name
            )

            extract_dicts.update(
                {
                    qid: {
                        LABEL[SINGULAR]: label,
                        DESCRIPTION[SINGULAR]: description,
                        START_TIME: start_time,
                        END_TIME: end_time,
                        TYPE: EXHIBITION,
                    }
                }
            )

            for langkey in language_keys:
                label_lang = map_wd_attribute.try_get_label_or_description(
                    result, LABEL[PLURAL], langkey, type_name
                )
                description_lang = map_wd_attribute.try_get_label_or_description(
                    result, DESCRIPTION[PLURAL], langkey, type_name
                )
                extract_dicts[qid][f"{LABEL[SINGULAR]}_{langkey}"] = label_lang
                extract_dicts[qid][f"{DESCRIPTION[SINGULAR]}_{langkey}"] = description_lang

        item_count += len(chunk)
        print(
            f"Status of exhibition entities: {item_count}/{len(qids)}",
            end="\r",
            flush=True,
        )

    print(datetime.datetime.now(), "Finished with exhibition entities")
    return extract_dicts
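# Illustrative sketch (the qid below is a hypothetical placeholder, not a value from
# the pipeline): in contrast to the other extraction helpers, exhibitions are returned
# as a dict keyed by qid, so callers can look up a single exhibition directly.
#
#   exhibitions = get_exhibition_entities({"Q123456"})
#   # exhibitions.get("Q123456") -> {LABEL[SINGULAR]: ..., START_TIME: ..., END_TIME: ..., TYPE: EXHIBITION, ...}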
def get_entity_labels(
    type_name: str,
    qids: List[str],
    language_keys: Optional[List[str]] = lang_keys,
) -> List[Dict]:
    """Function to get the entity labels from wikidata

    Args:
        type_name: oab type e. g. movement
        qids: List of qids to extract the labels from
        language_keys: All language keys which should be extracted. Defaults to languageconfig.csv

    Returns:
        List of dicts containing the qid and the labels for each language
    """
    print(datetime.datetime.now(), f"Starting with {type_name} {LABEL[PLURAL]}")
    print(f"Total {type_name} {LABEL[PLURAL]} to extract: {len(qids)}")
    item_count = 0
    extract_dicts = []
    chunk_size = 50  # The chunksize 50 is allowed by the wikidata api, bigger numbers need special permissions
    id_chunks = chunks(list(qids), chunk_size)
    for chunk in id_chunks:
        query_result = wikidata_entity_request(
            chunk, props=[LABEL[PLURAL]], timeout=10
        )  # country entities take longer so timeout is increased

        if ENTITIES not in query_result:
            logger.error("Skipping chunk")
            continue

        for result in query_result[ENTITIES].values():
            try:
                qid = result[ID]
            except Exception as error:
                logger.error("Error on qid, skipping item. Error: {0}".format(error))
                continue

            label = map_wd_attribute.try_get_label_or_description(
                result, LABEL[PLURAL], EN, type_name
            )
            subject_dict = {
                ID: qid,
                LABEL[SINGULAR]: label,
            }

            for langkey in language_keys:
                label_lang = map_wd_attribute.try_get_label_or_description(
                    result, LABEL[PLURAL], langkey, type_name
                )
                subject_dict.update({f"{LABEL[SINGULAR]}_{langkey}": label_lang})
            extract_dicts.append(subject_dict)

        item_count += len(chunk)
        print(
            f"Status of {type_name} {LABEL[PLURAL]}: {item_count}/{len(qids)}",
            end="\r",
            flush=True,
        )

    print(datetime.datetime.now(), f"Finished with {type_name} {LABEL[PLURAL]}")
    return extract_dicts
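# Illustrative sketch (the qids are Wikidata examples, e.g. Q183 is Germany and Q142 is
# France; the type_name string is mainly used for console and log output): only labels
# are requested, which keeps the query small, and the increased timeout above accounts
# for country entities taking longer.
#
#   country_labels = get_entity_labels("countries", ["Q183", "Q142"])
#   # each dict: {ID: ..., LABEL[SINGULAR]: ..., plus one f"{LABEL[SINGULAR]}_{langkey}" entry per language}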
def get_classes(
    type_name: str,
    qids: List[str],
    already_extracted_superclass_ids: Set[str] = set(),
    language_keys: Optional[List[str]] = lang_keys,
) -> List[Dict]:
    """Function to extract the classes of the extracted wikidata entities (meaning the 'instance of' attribute wikidata entity qids).
    Their subclasses are also extracted recursively (also called transitive closure)

    Args:
        type_name: oab type e. g. movement
        qids: List of qids to extract the labels from
        language_keys: All language keys which should be extracted. Defaults to languageconfig.csv
        already_extracted_superclass_ids: A list of already extracted superclass ids for the recursive calls,
            this is also the anchor to stop recursion

    Returns:
        Returns a list of dicts with the classes from the oab entities and their subclasses
    """
    print(datetime.datetime.now(), f"Starting with {type_name}")
    if type_name == CLASS[PLURAL]:
        print(
            f"Total {type_name} to extract (only 'instance_of' of the provided qids): {len(qids)}"
        )
    else:
        print(
            f"Total {type_name} to extract (only 'subclass_of' of the provided qids): {len(qids)}"
        )
    item_count = 0
    extract_dicts = []
    chunk_size = 50  # The chunksize 50 is allowed by the wikidata api, bigger numbers need special permissions
    classes_id_chunks = chunks(list(qids), chunk_size)
    for chunk in classes_id_chunks:
        query_result = wikidata_entity_request(chunk)

        if ENTITIES not in query_result:
            logger.error("Skipping chunk")
            continue

        for result in query_result[ENTITIES].values():
            try:
                qid = result[ID]
            except Exception as error:
                logger.error("Error on qid, skipping item. Error: {0}".format(error))
                continue

            label = map_wd_attribute.try_get_label_or_description(
                result, LABEL[PLURAL], EN, type_name
            )
            description = map_wd_attribute.try_get_label_or_description(
                result, DESCRIPTION[PLURAL], EN, type_name
            )
            subclass_of = map_wd_attribute.try_get_qid_reference_list(
                result, PROPERTY_NAME_TO_PROPERTY_ID[SUBCLASS_OF], type_name
            )
            class_dict = {
                ID: qid,
                LABEL[SINGULAR]: label,
                DESCRIPTION[SINGULAR]: description,
                SUBCLASS_OF: subclass_of,
            }

            for langkey in language_keys:
                label_lang = map_wd_attribute.try_get_label_or_description(
                    result, LABEL[PLURAL], langkey, type_name
                )
                description_lang = map_wd_attribute.try_get_label_or_description(
                    result, DESCRIPTION[PLURAL], langkey, type_name
                )
                class_dict.update(
                    {
                        f"{LABEL[SINGULAR]}_{langkey}": label_lang,
                        f"{DESCRIPTION[SINGULAR]}_{langkey}": description_lang,
                    }
                )
            extract_dicts.append(class_dict)

        item_count += len(chunk)
        print(f"Status of {type_name}: {item_count}/{len(qids)}", end="\r", flush=True)

    return load_entities_by_attribute_with_transitive_closure(
        extract_dicts,
        SUBCLASS_OF,
        CLASS[PLURAL],
        already_extracted_superclass_ids,
        get_classes,
        [],
    )
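# Hedged sketch: get_classes resolves the given class qids and then follows the qids
# referenced in SUBCLASS_OF recursively via load_entities_by_attribute_with_transitive_closure,
# so the transitive closure over 'subclass of' is loaded as well. Q3305213 ('painting',
# taken from the extract_artworks docstring below) is used as an illustrative input.
#
#   class_dicts = get_classes(CLASS[PLURAL], ["Q3305213"])
#   # each dict: {ID, LABEL[SINGULAR], DESCRIPTION[SINGULAR], SUBCLASS_OF, per-language labels/descriptions}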
def get_subject(
    type_name: str,
    qids: List[str],
    already_extracted_movement_ids: Set[str] = set(),
    language_keys: Optional[List[str]] = lang_keys,
) -> List[Dict]:
    """Extract subjects (in our definition everything except artworks e. g. movements, motifs, etc.) from wikidata

    Args:
        type_name: oab type name e. g. movements (Caution type names are always plural here)
        qids: A list of qids extracted from the artworks
        language_keys: All language keys which should be extracted. Defaults to languageconfig.csv

    Returns:
        A list of dicts with the subjects transformed from wikidata entities to oab entities
    """
    print(datetime.datetime.now(), f"Starting with {type_name}")
    print(f"Total {type_name} to extract: {len(qids)}")
    item_count = 0
    extract_dicts = []
    chunk_size = 50  # The chunksize 50 is allowed by the wikidata api, bigger numbers need special permissions
    subject_id_chunks = chunks(list(qids), chunk_size)
    for chunk in subject_id_chunks:
        query_result = wikidata_entity_request(chunk)

        if ENTITIES not in query_result:
            logger.error("Skipping chunk")
            continue

        for result in query_result[ENTITIES].values():
            subject_dict = map_wd_response.try_map_response_to_subject(result, type_name)
            if subject_dict is None:
                continue
            if type_name == MOVEMENT[PLURAL] or type_name == ARTIST[PLURAL]:
                influenced_by = map_wd_attribute.try_get_qid_reference_list(
                    result, PROPERTY_NAME_TO_PROPERTY_ID[INFLUENCED_BY], type_name
                )
                subject_dict.update({INFLUENCED_BY: influenced_by})
            if type_name == MOVEMENT[PLURAL]:
                subject_dict.update(map_wd_response.try_map_response_to_movement(result))
                already_extracted_movement_ids.add(subject_dict[ID])
            if type_name == ARTIST[PLURAL]:
                subject_dict.update(map_wd_response.try_map_response_to_artist(result))
            if type_name == LOCATION[PLURAL]:
                subject_dict.update(map_wd_response.try_map_response_to_location(result))
            extract_dicts.append(subject_dict)

        item_count += len(chunk)
        print(f"Status of {type_name}: {item_count}/{len(qids)}", end="\r", flush=True)

    if type_name == MOVEMENT[PLURAL]:
        extract_dicts = load_entities_by_attribute_with_transitive_closure(
            extract_dicts,
            PART_OF,
            MOVEMENT[PLURAL],
            already_extracted_movement_ids,
            get_subject,
            [ART_MOVEMENT[ID], ART_STYLE[ID]],
        )
        extract_dicts = load_entities_by_attribute_with_transitive_closure(
            extract_dicts,
            HAS_PART,
            MOVEMENT[PLURAL],
            already_extracted_movement_ids,
            get_subject,
            [ART_MOVEMENT[ID], ART_STYLE[ID]],
        )
        return extract_dicts

    print(datetime.datetime.now(), f"Finished with {type_name}")
    return extract_dicts
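# Hedged sketch: get_subject expects plural oab type names and enriches the mapped
# entities per type (e.g. INFLUENCED_BY for movements and artists); for movements it
# additionally follows PART_OF / HAS_PART references transitively. The qids are
# illustrative Wikidata examples (Q37853 'Baroque', Q5582 'Vincent van Gogh').
#
#   movements = get_subject(MOVEMENT[PLURAL], ["Q37853"])
#   artists = get_subject(ARTIST[PLURAL], ["Q5582"])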
def extract_artworks(
    type_name: str,
    wikidata_id: str,
    already_crawled_wikidata_items: Set,
    dev_mode: bool,
    dev_chunk_limit: int,
    language_keys: Optional[List[str]] = lang_keys,
) -> List[Dict]:
    """Extracts artworks metadata from Wikidata and stores them in a dictionary.

    Args:
        type_name: Type name of an artwork e. g. 'drawings'. Important for console output
        wikidata_id: Wikidata Id of a class; all instances of this class and all subclasses with image will be loaded. See artworks_ids_query.sparql
        already_crawled_wikidata_items: Set of all already crawled artwork items. Because the types have common items it is necessary to avoid loading items multiple times
        language_keys: All language keys which should be extracted. Defaults to languageconfig.csv
        dev_mode: To reduce the number of loaded chunks set this to true
        dev_chunk_limit: Limit of chunks per category

    Returns:
        A list with all artwork entity dicts (or JSON-objects) which are transformed for the OAB

    Examples:
        extract_artworks('drawings', 'wd:Q93184', ('en', 'de'))
        extract_artworks('sculptures', 'wd:Q860861', ('en', 'de'))
        extract_artworks('paintings', 'wd:Q3305213', ('en', 'de'))
    """
    print(datetime.datetime.now(), "Starting with", type_name)

    extract_dicts = []
    chunk_count = 0
    item_count = 0
    artwork_ids = query_artwork_qids(type_name, wikidata_id)

    # Don't load items again, if they were loaded in another artwork category
    # (filter instead of removing entries from the list while iterating over it)
    artwork_ids = [
        artwork_id
        for artwork_id in artwork_ids
        if artwork_id not in already_crawled_wikidata_items
    ]

    print(
        f"{len(artwork_ids)} {type_name} entries are not loaded yet, starting now. Already crawled item count is {len(already_crawled_wikidata_items)}"
    )
    chunk_size = 50  # The chunksize 50 is allowed by the wikidata api, bigger numbers need special permissions
    artwork_id_chunks = chunks(artwork_ids, chunk_size)
    for chunk in artwork_id_chunks:
        if dev_mode and chunk_count == dev_chunk_limit:
            logger.info(
                f"DEV_CHUNK_LIMIT of {type_name} reached. End extraction for {type_name}"
            )
            break

        query_result = wikidata_entity_request(chunk)
        if ENTITIES not in query_result:
            logger.error("Skipping chunk")
            continue

        for result in query_result[ENTITIES].values():
            try:
                qid = result[ID]
                image = map_wd_attribute.get_image_url_by_name(
                    result[CLAIMS][PROPERTY_NAME_TO_PROPERTY_ID[IMAGE]][0][MAINSNAK][
                        DATAVALUE
                    ][VALUE]
                )
            except Exception as error:
                logger.error(
                    "Error on qid or image, skipping item. Qid: {0}, Image: {1}, Error: {2}".format(
                        qid, image, error
                    )
                )
                continue

            label = map_wd_attribute.try_get_label_or_description(
                result, LABEL[PLURAL], EN, type_name
            )
            description = map_wd_attribute.try_get_label_or_description(
                result, DESCRIPTION[PLURAL], EN, type_name
            )

            (
                classes,
                artists,
                locations,
                genres,
                movements,
                materials,
                motifs,
                main_subjects,
                exhibition_history,
            ) = map_wd_attribute.get_attribute_values_with_try_get_func(
                result,
                [
                    CLASS[SINGULAR],
                    ARTIST[SINGULAR],
                    LOCATION[SINGULAR],
                    GENRE[SINGULAR],
                    MOVEMENT[SINGULAR],
                    MATERIAL[SINGULAR],
                    MOTIF[SINGULAR],
                    MAIN_SUBJECT[SINGULAR],
                    EXHIBITION_HISTORY,
                ],
                type_name,
                map_wd_attribute.try_get_qid_reference_list,
            )

            iconclasses = map_wd_attribute.try_get_value_list(
                result, PROPERTY_NAME_TO_PROPERTY_ID[ICONCLASS[SINGULAR]], type_name
            )
            inception = map_wd_attribute.try_get_year_from_property_timestamp(
                result, PROPERTY_NAME_TO_PROPERTY_ID[INCEPTION], type_name
            )
            country = map_wd_attribute.try_get_first_qid(
                result, PROPERTY_NAME_TO_PROPERTY_ID[COUNTRY], type_name
            )

            # Resolve dimensions
            # The units are qids which have to be resolved later
            (
                height,
                width,
                length,
                diameter,
            ) = map_wd_attribute.get_attribute_values_with_try_get_func(
                result,
                [HEIGHT, WIDTH, LENGTH, DIAMETER],
                type_name,
                map_wd_attribute.try_get_dimension_value,
            )
            (
                height_unit,
                width_unit,
                length_unit,
                diameter_unit,
            ) = map_wd_attribute.get_attribute_values_with_try_get_func(
                result,
                [HEIGHT, WIDTH, LENGTH, DIAMETER],
                type_name,
                map_wd_attribute.try_get_dimension_unit,
            )

            significant_events = map_wd_attribute.try_get_significant_events(result)

            artwork_dictionary = {
                ID: qid,
                CLASS[PLURAL]: classes,
                LABEL[SINGULAR]: label,
                DESCRIPTION[SINGULAR]: description,
                IMAGE: image,
                ARTIST[PLURAL]: artists,
                LOCATION[PLURAL]: locations,
                GENRE[PLURAL]: genres,
                MOVEMENT[PLURAL]: movements,
                INCEPTION: inception,
                MATERIAL[PLURAL]: materials,
                MOTIF[PLURAL]: motifs,
                COUNTRY: country,
                HEIGHT: height,
                HEIGHT_UNIT: height_unit,
                WIDTH: width,
                WIDTH_UNIT: width_unit,
                LENGTH: length,
                LENGTH_UNIT: length_unit,
                DIAMETER: diameter,
                DIAMETER_UNIT: diameter_unit,
                ICONCLASS[PLURAL]: iconclasses,
                MAIN_SUBJECT[PLURAL]: main_subjects,
                EXHIBITION_HISTORY: exhibition_history,
                SIGNIFICANT_EVENT: significant_events,
                TYPE: ARTWORK[SINGULAR],
            }

            # Apply blocklist to artwork dictionary
            for t in [
                CLASS[PLURAL],
                ARTIST[PLURAL],
                LOCATION[PLURAL],
                GENRE[PLURAL],
                MOVEMENT[PLURAL],
                MATERIAL[PLURAL],
                MOTIF[PLURAL],
                ICONCLASS[PLURAL],
                MAIN_SUBJECT[PLURAL],
                EXHIBITION_HISTORY,
            ]:
                try:
                    artwork_dictionary[t] = list(
                        set(artwork_dictionary[t]) - set(BLOCKLIST)
                    )
                except Exception as e:
                    logger.exception(e)
                    continue

            for langkey in language_keys:
                label_lang = map_wd_attribute.try_get_label_or_description(
                    result, LABEL[PLURAL], langkey, type_name
                )
                description_lang = map_wd_attribute.try_get_label_or_description(
                    result, DESCRIPTION[PLURAL], langkey, type_name
                )
                wikipedia_link_lang = map_wd_attribute.try_get_wikipedia_link(
                    result, langkey, type_name
                )
                artwork_dictionary.update(
                    {
                        f"{LABEL[SINGULAR]}_{langkey}": label_lang,
                        f"{DESCRIPTION[SINGULAR]}_{langkey}": description_lang,
                        f"{WIKIPEDIA_LINK}_{langkey}": wikipedia_link_lang,
                    }
                )
            extract_dicts.append(artwork_dictionary)
            already_crawled_wikidata_items.add(qid)

        item_count += len(chunk)
        print(
            f"Status of {type_name}: {item_count}/{len(artwork_ids)}",
            end="\r",
            flush=True,
        )
        chunk_count += 1

    print(datetime.datetime.now(), "Finished with", type_name)
    return extract_dicts
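# Hedged usage sketch matching the current signature (the docstring examples above
# appear to predate the already_crawled_wikidata_items, dev_mode and dev_chunk_limit
# parameters); the values are illustrative only, with wd:Q3305213 being the painting
# class mentioned in the docstring.
#
#   crawled_items: Set[str] = set()
#   paintings = extract_artworks(
#       "paintings", "wd:Q3305213", crawled_items, dev_mode=True, dev_chunk_limit=2
#   )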