def add_wikipedia_extracts(
    language_keys: Optional[List[str]] = lang_keys,
) -> None:
    """Add the wikipedia extracts to the already existing files

    Args:
        language_keys: Language keys to extract wikipedia abstracts for. Defaults to languageconfig.csv
    """
    for filename in [
        ARTWORK[PLURAL],
        MOTIF[PLURAL],
        GENRE[PLURAL],
        MATERIAL[PLURAL],
        MOVEMENT[PLURAL],
        ARTIST[PLURAL],
        LOCATION[PLURAL],
        CLASS[PLURAL],
    ]:
        print(
            datetime.datetime.now(),
            "Starting extracting wikipedia extracts with",
            filename,
        )
        try:
            with open(
                (create_new_path(filename)).with_suffix(f".{JSON}"), encoding="utf-8"
            ) as file:
                if RECOVER_MODE and check_state(
                    ETL_STATES.GET_WIKIPEDIA_EXTRACTS.EXTRACT_ABSTRACTS + filename
                ):
                    continue
                items = json.load(file)
                for key in language_keys:
                    item_indices_with_wiki_link_for_lang = [
                        items.index(item)
                        for item in items
                        if item[f"{WIKIPEDIA_LINK}_{key}"] != ""
                    ]
                    print(
                        f"There are {len(item_indices_with_wiki_link_for_lang)} {key}.wikipedia links within the {len(items)} {filename} items"
                    )

                    # Retry the operation until it is done
                    done = False
                    # ToDo: The limit for extracts seems to be 20. There is an excontinue parameter which
                    # could be used to increase the performance and load more at once (50 is allowed by the API) if needed.
                    # The request method has to be adjusted for this.
                    # Further information: https://stackoverflow.com/questions/9846795/prop-extracts-not-returning-all-extracts-in-the-wikimedia-api
                    chunk_size = 20
                    while not done:
                        try:
                            item_indices_chunks = chunks(
                                item_indices_with_wiki_link_for_lang, chunk_size
                            )
                            extracted_count = 0
                            # Fill JSON objects without a wiki link to an abstract with empty key-value pairs
                            # (could be removed if the frontend is adjusted)
                            for j in range(len(items)):
                                if j not in item_indices_with_wiki_link_for_lang:
                                    items[j][f"{ABSTRACT}_{key}"] = ""

                            for chunk in item_indices_chunks:
                                # Get page ids from URL, e.g. https://en.wikipedia.org/w/api.php?action=query&titles=Jean_Wauquelin_presenting_his_'Chroniques_de_Hainaut'_to_Philip_the_Good
                                page_id_indices_dictionary = get_wikipedia_page_ids(
                                    items, chunk, key
                                )
                                # Get extracts from page ids, e.g. https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&exintro&explaintext&pageids=70889|1115370
                                raw_response = get_wikipedia_extracts(
                                    items, page_id_indices_dictionary, key
                                )
                                # Add the extracted abstracts to the JSON objects
                                for i in chunk:
                                    items[i][f"{ABSTRACT}_{key}"] = raw_response[i]

                                extracted_count += len(chunk)
                                print(
                                    f"Extracts for {filename} and language {key} status: {extracted_count}/{len(item_indices_with_wiki_link_for_lang)}",
                                    end="\r",
                                    flush=True,
                                )

                            # If a chunk is finished and the chunk size is < 20 (e.g. the previous chunk failed
                            # but the current one succeeded): increase the chunk size
                            chunk_size = chunk_size + 5 if chunk_size < 20 else chunk_size
                            # Set done to True after all items have been processed
                            done = True
                        except Exception as error:
                            logger.error(
                                f"Fetching wikipedia extracts for {filename}, lang: {key} and chunk size: {chunk_size} failed!"
                            )
                            logger.error(error)
                            # Lower the chunk size and try again in the while loop
                            chunk_size -= 5
                            if chunk_size > 0:
                                logger.info(
                                    f"Trying the wikipedia extracts again with chunk size: {chunk_size}"
                                )
                                continue
                            else:
                                logger.exception(error)
                                raise error

            # Overwrite the file
            with open(
                (create_new_path(filename)).with_suffix(f".{JSON}"),
                "w",
                newline="",
                encoding="utf-8",
            ) as file:
                json.dump(items, file, ensure_ascii=False)
            write_state(ETL_STATES.GET_WIKIPEDIA_EXTRACTS.EXTRACT_ABSTRACTS + filename)
        except Exception as error:
            print(
                f"Error when opening following file: {filename}. Error: {error}. Skipping file now."
            )
            continue
        print(
            datetime.datetime.now(),
            "Finished extracting wikipedia extracts with",
            filename,
        )


if __name__ == "__main__":
    if len(sys.argv) > 1 and "-r" in sys.argv:
        RECOVER_MODE = True
    if RECOVER_MODE and check_state(ETL_STATES.GET_WIKIPEDIA_EXTRACTS.STATE):
        exit(0)
    logger.info("Extracting Wikipedia Abstracts")
    add_wikipedia_extracts()
    write_state(ETL_STATES.GET_WIKIPEDIA_EXTRACTS.STATE)
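# The chunks() helper called above is not part of this excerpt. A minimal sketch of
# such a helper, assuming it simply yields successive slices of the index list
# (the actual implementation in the pipeline may differ):
from typing import Iterator, List


def chunks(indices: List[int], chunk_size: int) -> Iterator[List[int]]:
    """Yield successive chunk_size-sized slices from indices."""
    for start in range(0, len(indices), chunk_size):
        yield indices[start:start + chunk_size]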
                    and inceptions[movement["id"]]["start"] < movement["start_time"]):
                movement["start_time_est"] = inceptions[movement["id"]]["start"]
            else:
                movement["start_time_est"] = ""
            if (not movement["end_time"]
                    and (not movement["start_time"]
                         or inceptions[movement["id"]]["end"] > movement["start_time"])
                ) or (movement["end_time"]
                      and inceptions[movement["id"]]["end"] > movement["end_time"]):
                movement["end_time_est"] = inceptions[movement["id"]]["end"]
            else:
                movement["end_time_est"] = ""
        else:
            movement["start_time_est"] = ""
            movement["end_time_est"] = ""
        movements_modified.append(movement)

    with open(movements_output_file, "w", newline="", encoding="utf-8") as file:
        json.dump(movements_modified, file, ensure_ascii=False, cls=DecimalEncoder)
    write_state(ETL_STATES.DATA_TRANSFORMATION.ESTIMATE_MOVEMENT_PERIOD)
    # print('took ', datetime.now() - start)
    return movements


if __name__ == "__main__":
    if len(sys.argv) > 1 and "-r" in sys.argv:
        RECOVER_MODE = True
    if RECOVER_MODE and check_state(
        ETL_STATES.DATA_TRANSFORMATION.HAS_PART_PART_OF_ENHANCEMENT
    ):
        exit(0)
    print(
        "Starting part of, has part enhancement on movements",
        datetime.datetime.now(),
    )
    movements_file = (
        Path(__file__).resolve().parent.parent
        / "crawler_output"
        / "intermediate_files"
        / "json"
        / "movements.json"
    )
    with open(movements_file, encoding="utf-8") as file:
        movements = json.load(file)
    movements = inverse_attribute_enhancement(HAS_PART, PART_OF, movements)
    movements = inverse_attribute_enhancement(PART_OF, HAS_PART, movements)
    with open(movements_file, "w", newline="", encoding="utf-8") as file:
        file.write(json.dumps(movements, ensure_ascii=False))
    print(
        "Finished part of, has part enhancement on movements",
        datetime.datetime.now(),
    )
    write_state(ETL_STATES.DATA_TRANSFORMATION.HAS_PART_PART_OF_ENHANCEMENT)
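# inverse_attribute_enhancement() is defined elsewhere in this script. Based only on how
# it is called above (once per direction, with the attribute names swapped), a plausible
# sketch of symmetrizing the two inverse relations could look like this. This is an
# illustrative assumption, not the pipeline's actual implementation; it assumes the
# attributes hold lists of movement ids:
def inverse_attribute_enhancement(attribute, inverse_attribute, movements):
    """If movement A lists B under `attribute`, ensure B lists A under `inverse_attribute`."""
    movements_by_id = {movement["id"]: movement for movement in movements}
    for movement in movements:
        for target_id in movement.get(attribute, []):
            target = movements_by_id.get(target_id)
            if target is None:
                continue
            inverse_values = target.setdefault(inverse_attribute, [])
            if movement["id"] not in inverse_values:
                inverse_values.append(movement["id"])
    return movements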
"Starting with splitting art_ontology.json to its language files", ) for lang_key in language_keys: with open(art_ontology_file, encoding="utf-8") as json_file: art_ontology = ijson.items(json_file, 'item') art_ontology_for_lang = [] print(f"Start generating art_ontology_{lang_key}.{JSON}") for item in art_ontology: if is_jsonable(item): art_ontology_for_lang.append( modify_langdict(item, lang_key)) art_ontology_for_lang = remove_language_key_attributes_in_exhibitions( art_ontology_for_lang) generate_json( art_ontology_for_lang, Path(__file__).resolve().parent.parent / CRAWLER_OUTPUT / f"art_ontology_{lang_key}", ) json_file.close() print(f"Finished generating art_ontology_{lang_key}.{JSON}") print( datetime.datetime.now(), "Finished with splitting art_ontology.json to its language files", ) write_state(ETL_STATES.DATA_TRANSFORMATION.SPLIT_LANGUAGES, Path(__file__).parent.parent)
def extract_art_ontology() -> None:
    """Extracts *.csv and *.json files for artworks and subjects (e.g. motifs, movements) from wikidata"""

    # Set of already crawled wikidata items
    already_crawled_wikidata_items = set(BLOCKLIST)

    for source in SOURCE_TYPES if not TEST_MODE else SOURCE_TYPES[:CLASS_LIM]:
        if RECOVER_MODE and check_state(
            ETL_STATES.GET_WIKIDATA_ITEMS.EXTRACT_SOURCE + source[PLURAL]
        ):
            continue
        extracted_artwork = load_wd_entities.extract_artworks(
            source[PLURAL], source[ID], already_crawled_wikidata_items, DEV, DEV_CHUNK_LIMIT
        )

        path_name = create_new_path(ARTWORK[PLURAL], source[PLURAL], CSV)
        generate_csv(extracted_artwork, get_fields(source[PLURAL]), path_name)

        path_name = create_new_path(ARTWORK[PLURAL], source[PLURAL], JSON)
        generate_json(extracted_artwork, path_name)
        write_state(ETL_STATES.GET_WIKIDATA_ITEMS.EXTRACT_SOURCE + source[PLURAL])

    if RECOVER_MODE and check_state(ETL_STATES.GET_WIKIDATA_ITEMS.MERGED_ARTWORKS):
        return
    merged_artworks = merge_artworks()

    path_name = create_new_path(ARTWORK[PLURAL], file_type=CSV)

    # Get motifs and main subjects
    motifs = load_wd_entities.extract_motifs_and_main_subjects(merged_artworks)
    for motif in motifs:
        motif.update({TYPE: MOTIF[SINGULAR]})

    # Get extracted genres, materials, etc.
    (
        genres,
        materials,
        movements,
        artists,
        locations,
        classes,
    ) = load_wd_entities.bundle_extract_subjects_calls(
        [
            GENRE[PLURAL],
            MATERIAL[PLURAL],
            MOVEMENT[PLURAL],
            ARTIST[PLURAL],
            LOCATION[PLURAL],
            CLASS[PLURAL],
        ],
        merged_artworks,
    )

    print("Total movements after transitive closure loading: ", len(movements))

    for subject, type_name in [
        (genres, GENRE[SINGULAR]),
        (materials, MATERIAL[SINGULAR]),
        (movements, MOVEMENT[SINGULAR]),
        (artists, ARTIST[SINGULAR]),
        (locations, LOCATION[SINGULAR]),
        (classes, CLASS[SINGULAR]),
    ]:
        for entity in subject:
            entity.update({TYPE: type_name})

    # Get distinct classes from artworks, motifs, etc.
    extracted_classes = load_wd_entities.get_distinct_extracted_classes(
        merged_artworks,
        motifs,
        genres,
        materials,
        movements,
        artists,
        locations,
        classes,
    )
    for extracted_class in extracted_classes:
        extracted_class.update({TYPE: CLASS[SINGULAR]})

    # Add the "subclass_of" attribute from the extracted_classes to the crawled classes
    existing_classes = []
    for class_itm in classes:
        extracted_class = [
            d for d in extracted_classes if class_itm[ID] in d.values()
        ]
        if len(extracted_class) > 0:
            class_itm.update({SUBCLASS_OF: extracted_class[0][SUBCLASS_OF]})
        existing_classes.append(class_itm[ID])

    # Append classes that are missing from our first list
    for class_itm in extracted_classes:
        if class_itm[ID] not in existing_classes:
            classes.append(class_itm)

    print("Total classes after transitive closure loading: ", len(classes))

    # Get country labels for merged artworks and locations
    (
        locations,
        merged_artworks,
        movements,
    ) = load_wd_entities.get_country_labels_for_merged_artworks_and_locations(
        locations, merged_artworks, movements
    )

    # Get labels for artists
    artists = load_wd_entities.get_labels_for_artists(
        artists, [GENDER, PLACE_OF_BIRTH, PLACE_OF_DEATH, CITIZENSHIP]
    )

    # Get unit symbols from qid for artworks
    distinct_unit_qids = load_wd_entities.get_distinct_unit_symbol_qids(merged_artworks)
    unit_symbols = load_wd_entities.get_unit_symbols(distinct_unit_qids)
    load_wd_entities.resolve_unit_id_to_unit_symbol(merged_artworks, unit_symbols)

    # Get exhibition histories as subdict
    merged_artworks = load_wd_entities.resolve_exhibition_ids_to_exhibition_entities(
        merged_artworks
    )

    # Significant events as subdict
    merged_artworks = load_wd_entities.resolve_significant_event_id_entities_to_labels(
        merged_artworks
    )

    # Write to JSON and CSV
    write_data_to_json_and_csv(
        motifs,
        genres,
        extracted_classes,
        materials,
        movements,
        locations,
        merged_artworks,
        artists,
        classes,
    )
    write_state(ETL_STATES.GET_WIKIDATA_ITEMS.MERGED_ARTWORKS)


if __name__ == "__main__":
    if len(sys.argv) > 1:
        print(sys.argv)
        dev_count_set = False
        if "-d" in sys.argv:
            if (
                len(sys.argv) > sys.argv.index("-d") + 1
                and sys.argv[sys.argv.index("-d") + 1].isdigit()
            ):
                DEV_CHUNK_LIMIT = int(sys.argv[sys.argv.index("-d") + 1])
                dev_count_set = True
            print("DEV MODE: on, DEV_LIM={0}".format(DEV_CHUNK_LIMIT))
            DEV = True
        if "-r" in sys.argv:
            RECOVER_MODE = True
        if "-t" in sys.argv:
            if (
                len(sys.argv) > sys.argv.index("-t") + 1
                and sys.argv[sys.argv.index("-t") + 1].isdigit()
            ):
                CLASS_LIM = int(sys.argv[sys.argv.index("-t") + 1])
            print("TEST MODE: on, CLASS_LIM={0}".format(CLASS_LIM))
            TEST_MODE = True
            DEV = True
            if not dev_count_set:
                DEV_CHUNK_LIMIT = 3

    if RECOVER_MODE and check_state(ETL_STATES.GET_WIKIDATA_ITEMS.STATE):
        exit(0)
    logger.info("Extracting Art Ontology")
    extract_art_ontology()
    write_state(ETL_STATES.GET_WIKIDATA_ITEMS.STATE)
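# check_state() and write_state() are shared by all of these scripts but not shown in the
# excerpts. A minimal sketch of how such recovery markers could work, assuming completed
# state names are appended as lines to a plain-text file. The file name, location, and
# signatures below are assumptions, not the pipeline's actual choices:
from pathlib import Path


def _state_file(base_dir: Path) -> Path:
    return base_dir / "etl_states.log"  # hypothetical file name


def write_state(state: str, base_dir: Path = Path(__file__).resolve().parent) -> None:
    """Record a finished ETL step so a later run with -r can skip it."""
    with open(_state_file(base_dir), "a", encoding="utf-8") as file:
        file.write(state + "\n")


def check_state(state: str, base_dir: Path = Path(__file__).resolve().parent) -> bool:
    """Return True if the given ETL step was already recorded as finished."""
    state_file = _state_file(base_dir)
    return state_file.exists() and state in state_file.read_text(encoding="utf-8").splitlines()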
        datetime.datetime.now(),
        f"Starting with adding youtube videos for file: {entity_type}",
    )
    try:
        # Open the file
        with open(
            (create_new_path(entity_type)).with_suffix(f".{JSON}"), encoding="utf-8"
        ) as file:
            items = json.load(file)
            entities = add_youtube_videos(items, check_ids=check)

        # Overwrite the file
        with open(
            (create_new_path(entity_type)).with_suffix(f".{JSON}"),
            "w",
            newline="",
            encoding="utf-8",
        ) as file:
            json.dump(entities, file, ensure_ascii=False)
    except Exception as error:
        logger.error(
            f"Error when opening following file: {entity_type}. Skipping file now.\nError:"
        )
        logger.exception(error)
        continue
    print(
        datetime.datetime.now(),
        f"Finished adding youtube videos for file: {entity_type}",
    )

write_state(ETL_STATES.DATA_TRANSFORMATION.ADD_YOUTUBE_VIDEOS)
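# create_new_path() is used by nearly every script above but defined in shared utilities
# that are not part of these excerpts. Based on the explicit path built in the
# has-part/part-of script (crawler_output/intermediate_files/<format>/<name>), a sketch
# of what it could look like. Parameter names, defaults, and the mkdir call are
# assumptions, not the actual helper:
from pathlib import Path
from typing import Optional


def create_new_path(file_name: str, subpath: Optional[str] = None, file_type: str = "json") -> Path:
    """Build a path below crawler_output/intermediate_files for the given file name."""
    path = (
        Path(__file__).resolve().parent.parent
        / "crawler_output"
        / "intermediate_files"
        / file_type
        / (f"{file_name}/{subpath}" if subpath else file_name)
    )
    path.parent.mkdir(parents=True, exist_ok=True)  # assumed convenience, may not match the real helper
    return path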
    try:
        # Read in the file
        with open(
            (create_new_path(filename)).with_suffix(f".{JSON}"), encoding="utf-8"
        ) as file:
            if filename == ARTWORK[PLURAL]:
                artworks = out_file = rank_artworks(json.load(file))
            else:
                out_file = rank_subjects(filename, json.load(file), artworks)

        # Overwrite the file
        # TODO: if the merging is done with something other than the js script, then overwrite the current file
        with open(
            (create_new_path(filename)).with_suffix(f".{JSON}"),
            "w",
            newline="",
            encoding="utf-8",
        ) as file:
            json.dump(out_file, file, ensure_ascii=False)
        print(
            datetime.datetime.now(),
            "Finished ranking with",
            filename,
        )
    except Exception as error:
        logger.error(
            f"Error when opening following file: {filename}. Skipping file now.\nError:"
        )
        logger.exception(error)
        continue

write_state(ETL_STATES.DATA_TRANSFORMATION.RANKING)
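# rank_artworks() and rank_subjects() are defined elsewhere in this module. The call
# pattern above (artworks are ranked first, then handed to rank_subjects) suggests that
# subject ranking is derived from the artworks referencing them. The sketch below is
# purely illustrative: the counting criterion and the "rank" field name are assumptions,
# not the pipeline's actual ranking logic.
def rank_subjects_by_artwork_references(filename, subjects, artworks):
    """Toy ranking: order subjects by how many artworks list their id under `filename`."""
    reference_counts = {}
    for artwork in artworks:
        for subject_id in artwork.get(filename, []):
            reference_counts[subject_id] = reference_counts.get(subject_id, 0) + 1
    for subject in subjects:
        subject["rank"] = reference_counts.get(subject["id"], 0)  # hypothetical field name
    return sorted(subjects, key=lambda subject: subject["rank"], reverse=True)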