Example #1
def merge_artworks() -> List[Dict]:
    """Merges artworks from files 'paintings.json', 'drawings.json',
    'sculptures.json' (function extract_artworks) and
    stores them in a dictionary

    Returns:
        A list of dictionaries containing all artworks
    """
    print(datetime.datetime.now(), "Starting with", "merging artworks")
    artworks = set()
    file_names = [
        f"{PAINTING[PLURAL]}.{JSON}",
        f"{DRAWING[PLURAL]}.{JSON}",
        f"{SCULPTURE[PLURAL]}.{JSON}",
    ]
    file_names = [
        create_new_path(ARTWORK[PLURAL], subpath=file_name)
        for file_name in file_names
    ]
    extract_dicts = []

    for file_name in file_names:
        with open(file_name, encoding="utf-8") as input_file:
            object_array = json.load(input_file)
            for obj in object_array:
                if obj[ID] not in artworks:  # remove duplicates
                    obj[TYPE] = ARTWORK[SINGULAR]
                    extract_dicts.append(obj)
                    artworks.add(obj[ID])

    print(datetime.datetime.now(), "Finished with", "merging artworks")
    print()
    return extract_dicts
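
This snippet, and the ones that follow, index constant dictionaries such as PAINTING[PLURAL], ARTWORK[SINGULAR] or DRAWING[ID] that are defined elsewhere in the project. The sketch below shows what such a constants module might look like; the key names are taken from the code above, while the concrete values (including the Wikidata ids) are illustrative assumptions only.

# Sketch of the assumed constants; the values are illustrative, not the project's.
SINGULAR, PLURAL, ID, TYPE = "singular", "plural", "id", "type"
JSON, CSV = "json", "csv"
PAINTING = {SINGULAR: "painting", PLURAL: "paintings", ID: "Q3305213"}
DRAWING = {SINGULAR: "drawing", PLURAL: "drawings", ID: "Q93184"}
SCULPTURE = {SINGULAR: "sculpture", PLURAL: "sculptures", ID: "Q860861"}
ARTWORK = {SINGULAR: "artwork", PLURAL: "artworks"}
SOURCE_TYPES = [PAINTING, DRAWING, SCULPTURE]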
Example #2
def merge_artworks() -> List[Dict]:
    """Merges artworks from files 'paintings.json', 'drawings.json',
    'sculptures.json' (function extract_artworks) and
    stores them in a dictionary

    Returns:
        A list of dictionaries containing all artworks
    """
    print(datetime.datetime.now(), "Starting with", "merging artworks")
    artworks = set()
    file_names = [f"{source_type[PLURAL]}.{JSON}" for source_type in SOURCE_TYPES]
    file_names = [
        create_new_path(ARTWORK[PLURAL], subpath=file_name) for file_name in file_names
    ]
    extract_dicts = []

    for file_name in file_names:
        try:
            with open(file_name, encoding="utf-8") as input_file:
                for obj in ijson.items(input_file, 'item'):
                    if obj[ID] not in artworks and is_jsonable(obj):  # remove duplicates
                        obj[TYPE] = ARTWORK[SINGULAR]
                        extract_dicts.append(obj)
                        artworks.add(obj[ID])
        except Exception as e:
            logger.error(f"Error when opening following file: {file_name}. Skipping file now.")
            logger.error(e)
            continue
    print(datetime.datetime.now(), "Finished with", "merging artworks")
    print()
    return extract_dicts
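
Example #2 also calls is_jsonable, whose definition is not shown. A common way to implement such a check, given here as an assumption rather than the project's version, is to attempt the JSON serialization and catch the failure:

import json

def is_jsonable(obj) -> bool:
    """Return True if obj can be serialized to JSON, False otherwise (assumed helper)."""
    try:
        json.dumps(obj)
        return True
    except (TypeError, OverflowError):
        return False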
Example #3
def add_wikipedia_extracts(language_keys: Optional[List[str]] = lang_keys) -> None:
    """Add the wikipedia extracts to the already existing files

    Args:
        language_keys: Language keys to extract wikipedia abstracts for. Defaults to languageconfig.csv
    """
    for filename in [
        ARTWORK[PLURAL],
        MOTIF[PLURAL],
        GENRE[PLURAL],
        MATERIAL[PLURAL],
        MOVEMENT[PLURAL],
        ARTIST[PLURAL],
        LOCATION[PLURAL],
        CLASS[PLURAL],
    ]:
        print(
            datetime.datetime.now(),
            "Starting extracting wikipedia extracts with",
            filename,
        )
        try:
            with open(
                    (create_new_path(filename)).with_suffix(f".{JSON}"), encoding="utf-8"
            ) as file:
                if RECOVER_MODE and check_state(ETL_STATES.GET_WIKIPEDIA_EXTRACTS.EXTRACT_ABSTRACTS + filename):
                    continue
                items = json.load(file)
                for key in language_keys:
                    item_indices_with_wiki_link_for_lang = [
                        index
                        for index, item in enumerate(items)
                        if item[f"{WIKIPEDIA_LINK}_{key}"] != ""
                    ]
                    print(
                        f"There are {len(item_indices_with_wiki_link_for_lang)} {key}.wikipedia links within the {len(items)} {filename} items"
                    )

                    # retry the operation until it's done
                    done = False
                    # ToDo: The limit for extracts seems to be 20. There is an excontinue parameter which
                    # could be used to load more at once (the API allows 50) and improve performance if needed;
                    # the request method would have to be adjusted for this.
                    # Further information: https://stackoverflow.com/questions/9846795/prop-extracts-not-returning-all-extracts-in-the-wikimedia-api
                    chunk_size = 20

                    while not done:
                        try:
                            item_indices_chunks = chunks(
                                item_indices_with_wiki_link_for_lang, chunk_size
                            )
                            extracted_count = 0
                            # Fill json objects without wikilink to an abstract with empty key-value pairs (could be removed if frontend is adjusted)
                            for j in range(len(items)):
                                if j not in item_indices_with_wiki_link_for_lang:
                                    items[j][f"{ABSTRACT}_{key}"] = ""

                            for chunk in item_indices_chunks:
                                # Get PageIds from URL https://en.wikipedia.org/w/api.php?action=query&titles=Jean_Wauquelin_presenting_his_'Chroniques_de_Hainaut'_to_Philip_the_Good
                                page_id_indices_dictionary = get_wikipedia_page_ids(
                                    items, chunk, key
                                )
                                # Get Extracts from PageId https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&exintro&explaintext&pageids=70889|1115370
                                raw_response = get_wikipedia_extracts(
                                    items, page_id_indices_dictionary, key
                                )
                                # add extracted abstracts to json objects
                                for i in chunk:
                                    items[i][f"{ABSTRACT}_{key}"] = raw_response[i]

                                extracted_count += len(chunk)
                                print(
                                    f"Extracts for {filename} and language {key} status: {extracted_count}/{len(item_indices_with_wiki_link_for_lang)}",
                                    end="\r",
                                    flush=True,
                                )

                                # If a chunk is finished and the chunk size is < 20 (e.g. the previous chunk failed but the current one succeeded): increase the chunk size
                                chunk_size = chunk_size + 5 if chunk_size < 20 else chunk_size


                            # set done to true after all items have been processed
                            done = True
                        except Exception as error:
                            logger.error(f"Fetching wikipedia extracs for {filename}, lang:{key} and chunk size:{chunk_size} failed!")
                            logger.error(error)

                            # lower chunk size and try again in while loop
                            chunk_size -= 5
                            if chunk_size > 0:
                                logger.info(f"Trying the wikipedia extracts again with chunk size:{chunk_size}")
                                continue
                            else:
                                logger.exception(error)
                                raise error

            # overwrite file
            with open(
                    (create_new_path(filename)).with_suffix(f".{JSON}"),
                    "w",
                    newline="",
                    encoding="utf-8",
            ) as file:
                json.dump(items, file, ensure_ascii=False)
            write_state(ETL_STATES.GET_WIKIPEDIA_EXTRACTS.EXTRACT_ABSTRACTS + filename)

        except Exception as error:
            print(
                f"Error when opening following file: {filename}. Error: {error}. Skipping file now."
            )
            continue
        print(
            datetime.datetime.now(),
            "Finished extracting wikipedia extracts with",
            filename,
        )
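
Both this version and the one in Example #7 below rely on a chunks helper that is not shown here. A minimal sketch of the assumed behaviour (successive fixed-size slices of the index list) is:

from typing import Iterator, List, TypeVar

T = TypeVar("T")

def chunks(sequence: List[T], chunk_size: int) -> Iterator[List[T]]:
    """Yield successive chunk_size-sized slices of sequence (assumed helper)."""
    for start in range(0, len(sequence), chunk_size):
        yield sequence[start:start + chunk_size]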
Example #4
def write_data_to_json_and_csv(
    motifs: List[Dict],
    genres: List[Dict],
    extracted_classes: List[Dict],
    materials: List[Dict],
    movements: List[Dict],
    locations: List[Dict],
    merged_artworks: List[Dict],
    artists: List[Dict],
) -> None:
    """Writes the given lists of dictionaries to json and csv files

    Args:
        motifs: List of motifs
        genres: List of genres
        extracted_classes: List of classes
        materials: List of materials
        movements: List of movements
        locations: List of locations
        merged_artworks: List of artworks
        artists: List of artists
    """
    generate_json(MOTIF[SINGULAR], motifs, create_new_path(MOTIF[PLURAL]))
    generate_csv(
        motifs,
        get_fields(MOTIF[PLURAL]) + [TYPE],
        create_new_path(MOTIF[PLURAL], file_type=CSV),
    )
    generate_json(GENRE[SINGULAR], genres, create_new_path(GENRE[PLURAL]))
    generate_csv(
        genres,
        get_fields(GENRE[PLURAL]) + [TYPE],
        create_new_path(GENRE[PLURAL], file_type=CSV),
    )
    generate_json(CLASS[SINGULAR], extracted_classes,
                  create_new_path(CLASS[PLURAL]))
    generate_csv(
        extracted_classes,
        get_fields(CLASS[PLURAL]) + [TYPE],
        create_new_path(CLASS[PLURAL], file_type=CSV),
    )
    generate_json(MATERIAL[SINGULAR], materials,
                  create_new_path(MATERIAL[PLURAL]))
    generate_csv(
        materials,
        get_fields(MATERIAL[PLURAL]) + [TYPE],
        create_new_path(MATERIAL[PLURAL], file_type=CSV),
    )
    generate_json(MOVEMENT[SINGULAR], movements,
                  create_new_path(MOVEMENT[PLURAL]))
    generate_csv(
        movements,
        get_fields(MOVEMENT[PLURAL]) + [TYPE],
        create_new_path(MOVEMENT[PLURAL], file_type=CSV),
    )
    generate_json(LOCATION[SINGULAR], locations,
                  create_new_path(LOCATION[PLURAL]))
    generate_csv(
        locations,
        get_fields(LOCATION[PLURAL]) + [TYPE],
        create_new_path(LOCATION[PLURAL], file_type=CSV),
    )
    generate_json(ARTWORK[SINGULAR], merged_artworks,
                  create_new_path(ARTWORK[PLURAL]))
    generate_csv(
        merged_artworks,
        get_fields(ARTWORK[PLURAL]) + [TYPE],
        create_new_path(ARTWORK[PLURAL], file_type=CSV),
    )
    generate_json(ARTIST[SINGULAR], artists, create_new_path(ARTIST[PLURAL]))
    generate_csv(
        artists,
        get_fields(ARTIST[PLURAL]) + [TYPE],
        create_new_path(ARTIST[PLURAL], file_type=CSV),
    )
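
Each entity type above is written with the same generate_json/generate_csv pair. As a design note, the body could also be expressed as a loop over (entity constant, list) pairs; the sketch below reuses the project's helpers and constants exactly as the function above does and is a refactoring suggestion, not the project's code.

    # Hypothetical data-driven variant of the function body above.
    for entity, entity_list in [
        (MOTIF, motifs),
        (GENRE, genres),
        (CLASS, extracted_classes),
        (MATERIAL, materials),
        (MOVEMENT, movements),
        (LOCATION, locations),
        (ARTWORK, merged_artworks),
        (ARTIST, artists),
    ]:
        generate_json(entity[SINGULAR], entity_list, create_new_path(entity[PLURAL]))
        generate_csv(
            entity_list,
            get_fields(entity[PLURAL]) + [TYPE],
            create_new_path(entity[PLURAL], file_type=CSV),
        )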
Example #5
def extract_art_ontology() -> None:
    """Extracts *.csv and *.json files for artworks and subjects (e. g. motifs, movements) from wikidata
    """

    # Set of already crawled Wikidata items
    already_crawled_wikidata_items = set()

    for artwork, wd in [
        (DRAWING[PLURAL], DRAWING[ID]),
        (SCULPTURE[PLURAL], SCULPTURE[ID]),
        (PAINTING[PLURAL], PAINTING[ID]),
    ]:
        extracted_artwork = extract_artworks(artwork, wd,
                                             already_crawled_wikidata_items)

        path_name = create_new_path(ARTWORK[PLURAL], artwork, CSV)
        generate_csv(extracted_artwork, get_fields(artwork), path_name)

        path_name = create_new_path(ARTWORK[PLURAL], artwork, JSON)
        generate_json(artwork, extracted_artwork, path_name)

    merged_artworks = merge_artworks()

    path_name = create_new_path(ARTWORK[PLURAL], file_type=CSV)
    generate_csv(
        merged_artworks,
        get_fields(ARTWORK[PLURAL]) + [TYPE],
        path_name,
    )

    # Get motifs and main subjects
    motifs = extract_motifs_and_main_subjects(merged_artworks)

    # Get extracted genres, materials, etc.
    genres, materials, movements, artists, locations = bundle_extract_subjects_calls(
        [
            GENRE[PLURAL],
            MATERIAL[PLURAL],
            MOVEMENT[PLURAL],
            ARTIST[PLURAL],
            LOCATION[PLURAL],
        ],
        merged_artworks,
    )
    print("Total movements after transitive closure loading: ", len(movements))

    # Get distinct classes from artworks, motifs, etc.
    extracted_classes = get_distinct_extracted_classes(
        merged_artworks,
        motifs,
        genres,
        materials,
        movements,
        artists,
        locations,
    )
    print("Total classes after transitive closure loading: ",
          len(extracted_classes))
    # Get country labels for merged artworks and locations
    (
        locations,
        merged_artworks,
        movements,
    ) = get_country_labels_for_merged_artworks_and_locations(
        locations, merged_artworks, movements)

    # Get labels for artists
    artists = get_labels_for_artists(
        artists, [GENDER, PLACE_OF_BIRTH, PLACE_OF_DEATH, CITIZENSHIP])

    # Get unit symbols from qid for artworks
    distinct_unit_qids = get_distinct_unit_symbol_qids(merged_artworks)
    unit_symbols = get_unit_symbols(distinct_unit_qids)
    resolve_unit_id_to_unit_symbol(merged_artworks, unit_symbols)

    # Write to JSON
    write_data_to_json_and_csv(
        motifs,
        genres,
        extracted_classes,
        materials,
        movements,
        locations,
        merged_artworks,
        artists,
    )
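
generate_json, generate_csv and get_fields are project helpers whose bodies are not shown; their signatures are inferred from the calls above. Purely as an illustration, a generate_csv(entities, fields, path) along those lines might be implemented like this (an assumption, not the project's code):

import csv
from pathlib import Path
from typing import Dict, List

def generate_csv(entities: List[Dict], fields: List[str], path: Path) -> None:
    """Write the given entities to a CSV file restricted to the given field names (assumed helper)."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fields, extrasaction="ignore")
        writer.writeheader()
        for entity in entities:
            writer.writerow(entity)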
Example #6
    print("Added videos for {} entries. Saving the file..".format(
        entries_added_count))

    return entities


if __name__ == "__main__":
    check = "-c" in sys.argv
    for entity_type in [ARTWORK[PLURAL], ARTIST[PLURAL], MOVEMENT[PLURAL]]:
        print(
            datetime.datetime.now(),
            f"Starting with adding youtube videos for file: {entity_type}",
        )
        try:
            # Open file
            with open((create_new_path(entity_type)).with_suffix(f".{JSON}"),
                      encoding="utf-8") as file:
                items = json.load(file)

            entities = add_youtube_videos(items, check_ids=check)

            # Overwrite file
            with open(
                (create_new_path(entity_type)).with_suffix(f".{JSON}"),
                    "w",
                    newline="",
                    encoding="utf-8",
            ) as file:
                file.write(json.dumps(entities, ensure_ascii=False))
        except Exception as error:
            print(
Example #7
def add_wikipedia_extracts(
    language_keys: Optional[List[str]] = lang_keys,
) -> None:
    """Add the wikipedia extracts to the already existing files

    Args:
        language_keys: Language keys to extract wikipedia abstracts for. Defaults to languageconfig.csv
    """
    for filename in [
            ARTWORK[PLURAL],
            MOTIF[PLURAL],
            GENRE[PLURAL],
            MATERIAL[PLURAL],
            MOVEMENT[PLURAL],
            ARTIST[PLURAL],
            LOCATION[PLURAL],
    ]:
        print(
            datetime.datetime.now(),
            "Starting extracting wikipedia extracts with",
            filename,
        )
        try:
            with open((create_new_path(filename)).with_suffix(f".{JSON}"),
                      encoding="utf-8") as file:
                items = json.load(file)
                for key in language_keys:
                    item_indices_with_wiki_link_for_lang = [
                        index for index, item in enumerate(items)
                        if item[f"{WIKIPEDIA_LINK}_{key}"] != ""
                    ]
                    print(
                        f"There are {len(item_indices_with_wiki_link_for_lang)} {key}.wikipedia links within the {len(items)} {filename} items"
                    )
                    # ToDo: The limit for extracts seems to be 20. There is an excontinue parameter which
                    # could be used to load more at once (the API allows 50) and improve performance if needed;
                    # the request method would have to be adjusted for this.
                    # Further information: https://stackoverflow.com/questions/9846795/prop-extracts-not-returning-all-extracts-in-the-wikimedia-api
                    chunk_size = 20
                    item_indices_chunks = chunks(
                        item_indices_with_wiki_link_for_lang, chunk_size)
                    extracted_count = 0
                    # Fill json objects without wikilink to an abstract with empty key-value pairs (could be removed if frontend is adjusted)
                    for j in range(len(items)):
                        if j not in item_indices_with_wiki_link_for_lang:
                            items[j][f"{ABSTRACT}_{key}"] = ""

                    for chunk in item_indices_chunks:
                        # Get PageIds from URL https://en.wikipedia.org/w/api.php?action=query&titles=Jean_Wauquelin_presenting_his_'Chroniques_de_Hainaut'_to_Philip_the_Good
                        page_id_indices_dictionary = get_wikipedia_page_ids(
                            items, chunk, key)
                        # Get Extracts from PageId https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&exintro&explaintext&pageids=70889|1115370
                        raw_response = get_wikipedia_extracts(
                            items, page_id_indices_dictionary, key)
                        # add extracted abstracts to json objects
                        for i in chunk:
                            items[i][f"{ABSTRACT}_{key}"] = raw_response[i]

                        extracted_count += len(chunk)
                        print(
                            f"Extracts for {filename} and language {key} status: {extracted_count}/{len(item_indices_with_wiki_link_for_lang)}",
                            end="\r",
                            flush=True,
                        )

            # overwrite file
            with open(
                (create_new_path(filename)).with_suffix(f".{JSON}"),
                    "w",
                    newline="",
                    encoding="utf-8",
            ) as file:
                file.write(json.dumps(items, ensure_ascii=False))
        except Exception as error:
            print(
                f"Error when opening following file: {filename}. Error: {error}. Skipping file now."
            )
            continue
        print(
            datetime.datetime.now(),
            "Finished extracting wikipedia extracts with",
            filename,
        )
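
The ToDo comments above point at the MediaWiki extracts endpoint (prop=extracts with exintro and explaintext). The project's get_wikipedia_page_ids and get_wikipedia_extracts helpers are not shown; purely to illustrate the documented request format, a batch query for a list of page ids could look like the following sketch using requests (an assumption, not the project's implementation):

import requests

def fetch_plaintext_extracts(page_ids, language_key="en"):
    """Fetch intro plain-text extracts for a batch of Wikipedia page ids (illustrative sketch)."""
    response = requests.get(
        f"https://{language_key}.wikipedia.org/w/api.php",
        params={
            "format": "json",
            "action": "query",
            "prop": "extracts",
            "exintro": 1,
            "explaintext": 1,
            "pageids": "|".join(str(page_id) for page_id in page_ids),
        },
        timeout=30,
    )
    response.raise_for_status()
    pages = response.json()["query"]["pages"]
    # Map page id to its plain-text extract; missing extracts become empty strings.
    return {int(page_id): page.get("extract", "") for page_id, page in pages.items()}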
Example #8
def extract_art_ontology() -> None:
    """Extracts *.csv and *.json files for artworks and subjects (e. g. motifs, movements) from wikidata
    """

    # Set of already crawled Wikidata items
    already_crawled_wikidata_items = set()

    for artwork, wd in [
        (DRAWING[PLURAL], DRAWING[ID]),
        (SCULPTURE[PLURAL], SCULPTURE[ID]),
        (PAINTING[PLURAL], PAINTING[ID]),
    ]:
        extracted_artwork = load_wd_entities.extract_artworks(
            artwork, wd, already_crawled_wikidata_items, DEV, DEV_CHUNK_LIMIT)

        path_name = create_new_path(ARTWORK[PLURAL], artwork, CSV)
        generate_csv(extracted_artwork, get_fields(artwork), path_name)

        path_name = create_new_path(ARTWORK[PLURAL], artwork, JSON)
        generate_json(extracted_artwork, path_name)

    merged_artworks = merge_artworks()

    path_name = create_new_path(ARTWORK[PLURAL], file_type=CSV)

    # Get motifs and main subjects
    motifs = load_wd_entities.extract_motifs_and_main_subjects(merged_artworks)
    for motif in motifs:
        motif[TYPE] = MOTIF[SINGULAR]
    # Get extracted genres, materials, etc.
    (
        genres,
        materials,
        movements,
        artists,
        locations,
    ) = load_wd_entities.bundle_extract_subjects_calls(
        [
            GENRE[PLURAL],
            MATERIAL[PLURAL],
            MOVEMENT[PLURAL],
            ARTIST[PLURAL],
            LOCATION[PLURAL],
        ],
        merged_artworks,
    )
    print("Total movements after transitive closure loading: ", len(movements))

    for subject, type_name in [
        (genres, GENRE[SINGULAR]),
        (materials, MATERIAL[SINGULAR]),
        (movements, MOVEMENT[SINGULAR]),
        (artists, ARTIST[SINGULAR]),
        (locations, LOCATION[SINGULAR]),
    ]:
        for entity in subject:
            entity[TYPE] = type_name

    # Get distinct classes from artworks, motifs, etc.
    extracted_classes = load_wd_entities.get_distinct_extracted_classes(
        merged_artworks,
        motifs,
        genres,
        materials,
        movements,
        artists,
        locations,
    )
    for c in extracted_classes:
        c[TYPE] = CLASS[SINGULAR]
    print("Total classes after transitive closure loading: ",
          len(extracted_classes))
    # Get country labels for merged artworks and locations
    (
        locations,
        merged_artworks,
        movements,
    ) = load_wd_entities.get_country_labels_for_merged_artworks_and_locations(
        locations, merged_artworks, movements)

    # Get labels for artists
    artists = load_wd_entities.get_labels_for_artists(
        artists, [GENDER, PLACE_OF_BIRTH, PLACE_OF_DEATH, CITIZENSHIP])

    # Get unit symbols from qid for artworks
    distinct_unit_qids = load_wd_entities.get_distinct_unit_symbol_qids(
        merged_artworks)
    unit_symbols = load_wd_entities.get_unit_symbols(distinct_unit_qids)
    load_wd_entities.resolve_unit_id_to_unit_symbol(merged_artworks,
                                                    unit_symbols)

    # Get exhibition histories as subdict
    merged_artworks = load_wd_entities.resolve_exhibition_ids_to_exhibition_entities(
        merged_artworks)

    # Significant events as subdict
    merged_artworks = load_wd_entities.resolve_significant_event_id_entities_to_labels(
        merged_artworks)

    # Write to JSON
    write_data_to_json_and_csv(
        motifs,
        genres,
        extracted_classes,
        materials,
        movements,
        locations,
        merged_artworks,
        artists,
    )
Example #9
def write_data_to_json_and_csv(
        motifs: List[Dict],
        genres: List[Dict],
        extracted_classes: List[Dict],
        materials: List[Dict],
        movements: List[Dict],
        locations: List[Dict],
        merged_artworks: List[Dict],
        artists: List[Dict],
        classes: List[Dict],
) -> None:
    """Writes the given lists of dictionaries to json and csv files

    Args:
        motifs: List of motifs
        genres: List of genres
        extracted_classes: List of distinct extracted classes
        materials: List of materials
        movements: List of movements
        locations: List of locations
        merged_artworks: List of artworks
        artists: List of artists
        classes: List of crawled classes
    """
    generate_json(motifs, create_new_path(MOTIF[PLURAL]))
    generate_csv(
        motifs,
        get_fields(MOTIF[PLURAL]),
        create_new_path(MOTIF[PLURAL], file_type=CSV),
    )
    generate_json(genres, create_new_path(GENRE[PLURAL]))
    generate_csv(
        genres,
        get_fields(GENRE[PLURAL]),
        create_new_path(GENRE[PLURAL], file_type=CSV),
    )
    generate_json(extracted_classes, create_new_path(EXTRACTED_CLASS[PLURAL]))
    generate_csv(
        extracted_classes,
        get_fields(EXTRACTED_CLASS[PLURAL]),
        create_new_path(EXTRACTED_CLASS[PLURAL], file_type=CSV),
    )
    generate_json(materials, create_new_path(MATERIAL[PLURAL]))
    generate_csv(
        materials,
        get_fields(MATERIAL[PLURAL]),
        create_new_path(MATERIAL[PLURAL], file_type=CSV),
    )
    generate_json(movements, create_new_path(MOVEMENT[PLURAL]))
    generate_csv(
        movements,
        get_fields(MOVEMENT[PLURAL]),
        create_new_path(MOVEMENT[PLURAL], file_type=CSV),
    )
    generate_json(locations, create_new_path(LOCATION[PLURAL]))
    generate_csv(
        locations,
        get_fields(LOCATION[PLURAL]),
        create_new_path(LOCATION[PLURAL], file_type=CSV),
    )
    print(f"writing json of {len(merged_artworks)} artworks")
    generate_json(merged_artworks, create_new_path(ARTWORK[PLURAL]))
    generate_csv(
        merged_artworks,
        get_fields(ARTWORK[PLURAL]),
        create_new_path(ARTWORK[PLURAL], file_type=CSV),
    )
    generate_json(artists, create_new_path(ARTIST[PLURAL]))
    generate_csv(
        artists,
        get_fields(ARTIST[PLURAL]),
        create_new_path(ARTIST[PLURAL], file_type=CSV),
    )
    generate_json(classes, create_new_path(CLASS[PLURAL]))
    generate_csv(
        classes,
        get_fields(CLASS[PLURAL]),
        create_new_path(CLASS[PLURAL], file_type=CSV),
    )
Example #10
def extract_art_ontology() -> None:
    """Extracts *.csv and *.json files for artworks and subjects (e. g. motifs, movements) from wikidata
    """

    # Set of already crawled Wikidata items, seeded with the blocklist
    already_crawled_wikidata_items = set(BLOCKLIST)

    for source in (SOURCE_TYPES if not TEST_MODE else SOURCE_TYPES[:CLASS_LIM]):
        if RECOVER_MODE and check_state(ETL_STATES.GET_WIKIDATA_ITEMS.EXTRACT_SOURCE + source[PLURAL]):
            continue
        extracted_artwork = load_wd_entities.extract_artworks(
            source[PLURAL], source[ID], already_crawled_wikidata_items, DEV, DEV_CHUNK_LIMIT
        )

        path_name = create_new_path(ARTWORK[PLURAL], source[PLURAL], CSV)
        generate_csv(extracted_artwork, get_fields(source[PLURAL]), path_name)

        path_name = create_new_path(ARTWORK[PLURAL], source[PLURAL], JSON)
        generate_json(extracted_artwork, path_name)
        write_state(ETL_STATES.GET_WIKIDATA_ITEMS.EXTRACT_SOURCE + source[PLURAL])

    if RECOVER_MODE and check_state(ETL_STATES.GET_WIKIDATA_ITEMS.MERGED_ARTWORKS):
        return
    merged_artworks = merge_artworks()

    path_name = create_new_path(ARTWORK[PLURAL], file_type=CSV)

    # Get motifs and main subjects
    motifs = load_wd_entities.extract_motifs_and_main_subjects(merged_artworks)
    for motif in motifs:
        motif[TYPE] = MOTIF[SINGULAR]
    # Get extracted genres, materials, etc.
    (
        genres,
        materials,
        movements,
        artists,
        locations,
        classes,
    ) = load_wd_entities.bundle_extract_subjects_calls(
        [
            GENRE[PLURAL],
            MATERIAL[PLURAL],
            MOVEMENT[PLURAL],
            ARTIST[PLURAL],
            LOCATION[PLURAL],
            CLASS[PLURAL],
        ],
        merged_artworks,
    )
    print("Total movements after transitive closure loading: ", len(movements))

    for subject, type_name in [
        (genres, GENRE[SINGULAR]),
        (materials, MATERIAL[SINGULAR]),
        (movements, MOVEMENT[SINGULAR]),
        (artists, ARTIST[SINGULAR]),
        (locations, LOCATION[SINGULAR]),
        (classes, CLASS[SINGULAR]),
    ]:
        for entity in subject:
            entity[TYPE] = type_name

    # Get distinct classes from artworks, motifs, etc.
    extracted_classes = load_wd_entities.get_distinct_extracted_classes(
        merged_artworks, motifs, genres, materials, movements, artists, locations, classes,
    )
    for c in extracted_classes:
        c[TYPE] = CLASS[SINGULAR]

    # Add the "subclass_of" parameter from the extracted_classes to the crawled classes
    existing_classes = []
    for class_itm in classes:
        extracted_class = [d for d in extracted_classes if class_itm[ID] in d.values()]
        if extracted_class:
            class_itm[SUBCLASS_OF] = extracted_class[0][SUBCLASS_OF]
        existing_classes.append(class_itm[ID])

    # append classes that are missing from our first list
    for class_itm in extracted_classes:
        if class_itm[ID] not in existing_classes:
            classes.append(class_itm)

    print("Total classes after transitive closure loading: ", len(classes))
    # Get country labels for merged artworks and locations
    (
        locations,
        merged_artworks,
        movements,
    ) = load_wd_entities.get_country_labels_for_merged_artworks_and_locations(
        locations, merged_artworks, movements
    )

    # Get labels for artists
    artists = load_wd_entities.get_labels_for_artists(
        artists, [GENDER, PLACE_OF_BIRTH, PLACE_OF_DEATH, CITIZENSHIP]
    )

    # Get unit symbols from qid for artworks
    distinct_unit_qids = load_wd_entities.get_distinct_unit_symbol_qids(merged_artworks)
    unit_symbols = load_wd_entities.get_unit_symbols(distinct_unit_qids)
    load_wd_entities.resolve_unit_id_to_unit_symbol(merged_artworks, unit_symbols)

    # Get exhibition histories as subdict
    merged_artworks = load_wd_entities.resolve_exhibition_ids_to_exhibition_entities(
        merged_artworks
    )

    # Significant events as subdict
    merged_artworks = load_wd_entities.resolve_significant_event_id_entities_to_labels(
        merged_artworks
    )

    # Write to JSON
    write_data_to_json_and_csv(
        motifs,
        genres,
        extracted_classes,
        materials,
        movements,
        locations,
        merged_artworks,
        artists,
        classes,
    )
    write_state(ETL_STATES.GET_WIKIDATA_ITEMS.MERGED_ARTWORKS)
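
Examples #3 and #10 gate each step on RECOVER_MODE through check_state and write_state, which are not shown here. One simple way such ETL checkpoints could be implemented, given purely as an illustrative assumption (including the state file name), is a plain text file of completed step names:

from pathlib import Path

STATE_FILE = Path("etl_states.txt")  # hypothetical location of the checkpoint file

def write_state(state: str) -> None:
    """Record a completed ETL step so that a later recovery run can skip it (assumed helper)."""
    with open(STATE_FILE, "a", encoding="utf-8") as state_file:
        state_file.write(state + "\n")

def check_state(state: str) -> bool:
    """Return True if the given ETL step was already recorded as completed (assumed helper)."""
    if not STATE_FILE.exists():
        return False
    return state in STATE_FILE.read_text(encoding="utf-8").splitlines()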
Example #11
        MOTIF[PLURAL],  # Main subjects are not considered
        GENRE[PLURAL],
        MATERIAL[PLURAL],
        MOVEMENT[PLURAL],
        ARTIST[PLURAL],
        LOCATION[PLURAL],
        CLASS[PLURAL],
]:
    print(
        datetime.datetime.now(),
        "Starting ranking with",
        filename,
    )
    try:
        # Read in file
        with open((create_new_path(filename)).with_suffix(f".{JSON}"),
                  encoding="utf-8") as file:
            if filename == ARTWORK[PLURAL]:
                artworks = out_file = rank_artworks(json.load(file))
            else:
                out_file = rank_subjects(filename, json.load(file),
                                         artworks)
        # Overwrite file
        # TODO: if the merging is done by something other than the JS script, overwrite the current file
        with open(
            (create_new_path(filename)).with_suffix(f".{JSON}"),
                "w",
                newline="",
                encoding="utf-8",
        ) as file:
            json.dump(out_file, file, ensure_ascii=False)