Example #1
            with open(
                    (create_new_path(filename)).with_suffix(f".{JSON}"),
                    "w",
                    newline="",
                    encoding="utf-8",
            ) as file:
                json.dump(items, file, ensure_ascii=False)
            write_state(ETL_STATES.GET_WIKIPEDIA_EXTRACTS.EXTRACT_ABSTRACTS + filename)

        except Exception as error:
            print(
                f"Error when opening following file: {filename}. Error: {error}. Skipping file now."
            )
            continue
        print(
            datetime.datetime.now(),
            "Finished extracting wikipedia extracts with",
            filename,
        )


if __name__ == "__main__":
    if len(sys.argv) > 1 and "-r" in sys.argv:
        RECOVER_MODE = True

    if RECOVER_MODE and check_state(ETL_STATES.GET_WIKIPEDIA_EXTRACTS.STATE):
        exit(0)
    logger.info("Extracting Wikipedia Abstracts")
    add_wikipedia_extracts()
    write_state(ETL_STATES.GET_WIKIPEDIA_EXTRACTS.STATE)
Example #2
def add_wikipedia_extracts(language_keys: Optional[List[str]] = lang_keys) -> None:
    """Add the wikipedia extracts to the already existing files

    Args:
        language_keys: Language keys to extract Wikipedia abstracts for. Defaults to the keys from languageconfig.csv
    """
    for filename in [
        ARTWORK[PLURAL],
        MOTIF[PLURAL],
        GENRE[PLURAL],
        MATERIAL[PLURAL],
        MOVEMENT[PLURAL],
        ARTIST[PLURAL],
        LOCATION[PLURAL],
        CLASS[PLURAL],
    ]:
        print(
            datetime.datetime.now(),
            "Starting extracting wikipedia extracts with",
            filename,
        )
        try:
            with open(
                    (create_new_path(filename)).with_suffix(f".{JSON}"), encoding="utf-8"
            ) as file:
                if RECOVER_MODE and check_state(ETL_STATES.GET_WIKIPEDIA_EXTRACTS.EXTRACT_ABSTRACTS + filename):
                    continue
                items = json.load(file)
                for key in language_keys:
                    item_indices_with_wiki_link_for_lang = [
                        index
                        for index, item in enumerate(items)
                        if item[f"{WIKIPEDIA_LINK}_{key}"] != ""
                    ]
                    print(
                        f"There are {len(item_indices_with_wiki_link_for_lang)} {key}.wikipedia links within the {len(items)} {filename} items"
                    )

                    # Retry the operation until it's done
                    done = False
                    # ToDo: The limit for extracts seems to be 20. There is an excontinue parameter which
                    # could be used to improve performance and load more at once (the API allows 50) if needed;
                    # the request method would have to be adjusted for this.
                    # Further information: https://stackoverflow.com/questions/9846795/prop-extracts-not-returning-all-extracts-in-the-wikimedia-api
                    chunk_size = 20

                    while not done:
                        try:
                            item_indices_chunks = chunks(
                                item_indices_with_wiki_link_for_lang, chunk_size
                            )
                            extracted_count = 0
                            # Fill JSON objects that have no wiki link with an empty abstract key-value pair (could be removed if the frontend is adjusted)
                            for j in range(len(items)):
                                if j not in item_indices_with_wiki_link_for_lang:
                                    items[j][f"{ABSTRACT}_{key}"] = ""

                            for chunk in item_indices_chunks:
                                # Get PageIds from URL https://en.wikipedia.org/w/api.php?action=query&titles=Jean_Wauquelin_presenting_his_'Chroniques_de_Hainaut'_to_Philip_the_Good
                                page_id_indices_dictionary = get_wikipedia_page_ids(
                                    items, chunk, key
                                )
                                # Get Extracts from PageId https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&exintro&explaintext&pageids=70889|1115370
                                raw_response = get_wikipedia_extracts(
                                    items, page_id_indices_dictionary, key
                                )
                                # add extracted abstracts to json objects
                                for i in chunk:
                                    items[i][f"{ABSTRACT}_{key}"] = raw_response[i]

                                extracted_count += len(chunk)
                                print(
                                    f"Extracts for {filename} and language {key} status: {extracted_count}/{len(item_indices_with_wiki_link_for_lang)}",
                                    end="\r",
                                    flush=True,
                                )

                                # If a chunk is finished and the chunk size is < 20 (e.g. the previous chunk failed but the current one succeeded): increase the chunk size
                                chunk_size = chunk_size + 5 if chunk_size < 20 else chunk_size


                            # set done to true after all items have been processed
                            done = True
                        except Exception as error:
                            logger.error(f"Fetching wikipedia extracs for {filename}, lang:{key} and chunk size:{chunk_size} failed!")
                            logger.error(error)

                            # lower chunk size and try again in while loop
                            chunk_size -= 5
                            if chunk_size > 0:
                                logger.info(f"Trying the wikipedia extracts again with chunk size:{chunk_size}")
                                continue
                            else:
                                logger.exception(error)
                                raise error

            # overwrite file
            with open(
                    (create_new_path(filename)).with_suffix(f".{JSON}"),
                    "w",
                    newline="",
                    encoding="utf-8",
            ) as file:
                json.dump(items, file, ensure_ascii=False)
            write_state(ETL_STATES.GET_WIKIPEDIA_EXTRACTS.EXTRACT_ABSTRACTS + filename)

        except Exception as error:
            print(
                f"Error when opening following file: {filename}. Error: {error}. Skipping file now."
            )
            continue
        print(
            datetime.datetime.now(),
            "Finished extracting wikipedia extracts with",
            filename,
        )
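
The retry loop above relies on a chunks helper that is not shown in this example. A minimal sketch, assuming it simply slices the index list into pieces of at most chunk_size elements (the project's real helper may differ):

from typing import Iterator, List


def chunks(elements: List[int], chunk_size: int) -> Iterator[List[int]]:
    # Yield successive slices with at most chunk_size elements each.
    for start in range(0, len(elements), chunk_size):
        yield elements[start:start + chunk_size]
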
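Example #3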
                else:
                    inceptions[art_mov] = {
                        "start": artwork["inception"],
                        "end": artwork["inception"],
                    }
    return


if __name__ == "__main__":
    """Gets for all movements all first and last inceptions from the artworks file and
       sets them to each movement if start_time/end_time is missing.
    """
    if len(sys.argv) > 1 and "-r" in sys.argv:
        RECOVER_MODE = True

    if RECOVER_MODE and check_state(
            ETL_STATES.DATA_TRANSFORMATION.ESTIMATE_MOVEMENT_PERIOD):
        exit(0)

    # start = datetime.now()
    movements_file = (Path(__file__).resolve().parent.parent /
                      "crawler_output" / "intermediate_files" / "json" /
                      "movements.json")
    artworks_file = (Path(__file__).resolve().parent.parent /
                     "crawler_output" / "intermediate_files" / "json" /
                     "artworks.json")
    movements_output_file = movements_file  # Change here for different output file
    movements_list = ijson.items(open(movements_file, "r", encoding="utf-8"),
                                 "item")

    find_start_end_in_artworks(artworks_file)
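
find_start_end_in_artworks is only called here; apart from the else branch that seeds the inceptions dictionary at the top of this example, its body is not visible. The following is a rough sketch of what such a scan could look like, assuming each artwork carries a "movements" list and an "inception" value; unlike the original, which returns nothing, the sketch returns the collected ranges for illustration.

import ijson


def find_start_end_in_artworks(artworks_file) -> dict:
    # Rough sketch only: collect the earliest and latest inception per movement QID.
    inceptions = {}
    with open(artworks_file, "r", encoding="utf-8") as file:
        for artwork in ijson.items(file, "item"):
            if not artwork.get("inception"):
                continue
            for art_mov in artwork.get("movements", []):
                if art_mov in inceptions:
                    entry = inceptions[art_mov]
                    entry["start"] = min(entry["start"], artwork["inception"])
                    entry["end"] = max(entry["end"], artwork["inception"])
                else:
                    inceptions[art_mov] = {
                        "start": artwork["inception"],
                        "end": artwork["inception"],
                    }
    return inceptions
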
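Example #4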
            for qid in movement[attribute]:
                # Add the current movement id to the inverse attribute list
                if (movement[ID] not in movements[qid_index_dict[qid]]
                    [inverse_attribute]):
                    movements[qid_index_dict[qid]][inverse_attribute].append(
                        movement[ID])

    return movements


if __name__ == "__main__":

    if len(sys.argv) > 1 and "-r" in sys.argv:
        RECOVER_MODE = True

    if RECOVER_MODE and check_state(
            ETL_STATES.DATA_TRANSFORMATION.HAS_PART_PART_OF_ENHANCEMENT):
        exit(0)

    print("Starting part of, has part enhancement on movements",
          datetime.datetime.now())
    movements_file = (Path(__file__).resolve().parent.parent /
                      "crawler_output" / "intermediate_files" / "json" /
                      "movements.json")

    with open(movements_file, encoding="utf-8") as file:
        movements = json.load(file)
        movements = inverse_attribute_enhancement(HAS_PART, PART_OF, movements)
        movements = inverse_attribute_enhancement(PART_OF, HAS_PART, movements)

    with open(movements_file, "w", newline="", encoding="utf-8") as file:
        file.write(json.dumps(movements, ensure_ascii=False))
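
Only the innermost loop and the return of inverse_attribute_enhancement are visible at the top of this example. Here is a hedged reconstruction of the whole function, with the signature inferred from the two call sites above; qid_index_dict is assumed to map a movement's QID to its index in the movements list, and the guard for unknown QIDs is an addition for safety.

from typing import Dict, List


def inverse_attribute_enhancement(
    attribute: str, inverse_attribute: str, movements: List[Dict]
) -> List[Dict]:
    # ID is the project's constant for the QID key, as in the surrounding examples.
    qid_index_dict = {movement[ID]: index for index, movement in enumerate(movements)}
    for movement in movements:
        for qid in movement.get(attribute, []):
            if qid not in qid_index_dict:
                continue  # the referenced movement was not crawled
            # Add the current movement id to the inverse attribute list
            if movement[ID] not in movements[qid_index_dict[qid]][inverse_attribute]:
                movements[qid_index_dict[qid]][inverse_attribute].append(movement[ID])
    return movements
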
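Example #5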
    for entity in language_file:
        if EXHIBITION_HISTORY in entity and entity[EXHIBITION_HISTORY]:
            for attribute in [LABEL[SINGULAR], DESCRIPTION[SINGULAR]]:
                for key in lang_keys:
                    for exhibition in entity[EXHIBITION_HISTORY]:
                        if f"{attribute}_{key}" in exhibition:
                            del exhibition[f"{attribute}_{key}"]

    return language_file


if __name__ == "__main__":
    if len(sys.argv) > 1 and "-r" in sys.argv:
        RECOVER_MODE = True
    if RECOVER_MODE and check_state(
            ETL_STATES.DATA_TRANSFORMATION.SPLIT_LANGUAGES,
            Path(__file__).parent.parent):
        exit(0)
    art_ontology_file = (Path(__file__).resolve().parent.parent /
                         CRAWLER_OUTPUT / "art_ontology.json")
    print(
        datetime.datetime.now(),
        "Starting with splitting art_ontology.json to its language files",
    )

    for lang_key in language_keys:
        with open(art_ontology_file, encoding="utf-8") as json_file:
            art_ontology = ijson.items(json_file, 'item')
            art_ontology_for_lang = []
            print(f"Start generating art_ontology_{lang_key}.{JSON}")
            for item in art_ontology:
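
The listing breaks off inside the per-item loop, but the read pattern is worth noting: art_ontology.json is opened with ijson.items rather than json.load, so the potentially very large ontology is streamed one entity at a time on every language pass. A small standalone illustration of the difference, using the same file name as above:

import json

import ijson

# json.load materialises the whole array in memory at once ...
with open("art_ontology.json", encoding="utf-8") as json_file:
    everything = json.load(json_file)

# ... while ijson.items yields one entity at a time, so even a very large file
# can be filtered without holding all of it in memory.
with open("art_ontology.json", encoding="utf-8") as json_file:
    for entity in ijson.items(json_file, "item"):
        pass  # filter / transform a single entity here
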
Example #6
def extract_art_ontology() -> None:
    """Extracts *.csv and *.json files for artworks and subjects (e. g. motifs, movements) from wikidata
    """

    # Array of already crawled wikidata items
    already_crawled_wikidata_items = set(BLOCKLIST)

    for source in SOURCE_TYPES if not TEST_MODE else SOURCE_TYPES[:CLASS_LIM]:
        if RECOVER_MODE and check_state(ETL_STATES.GET_WIKIDATA_ITEMS.EXTRACT_SOURCE + source[PLURAL]):
            continue
        extracted_artwork = load_wd_entities.extract_artworks(
            source[PLURAL], source[ID], already_crawled_wikidata_items, DEV, DEV_CHUNK_LIMIT
        )

        path_name = create_new_path(ARTWORK[PLURAL], source[PLURAL], CSV)
        generate_csv(extracted_artwork, get_fields(source[PLURAL]), path_name)

        path_name = create_new_path(ARTWORK[PLURAL], source[PLURAL], JSON)
        generate_json(extracted_artwork, path_name)
        write_state(ETL_STATES.GET_WIKIDATA_ITEMS.EXTRACT_SOURCE + source[PLURAL])

    if RECOVER_MODE and check_state(ETL_STATES.GET_WIKIDATA_ITEMS.MERGED_ARTWORKS):
        return
    merged_artworks = merge_artworks()

    path_name = create_new_path(ARTWORK[PLURAL], file_type=CSV)

    # Get motifs and main subjects
    motifs = load_wd_entities.extract_motifs_and_main_subjects(merged_artworks)
    for motif in motifs:
        motif.update({TYPE: MOTIF[SINGULAR]})
    # Get extracted genres, materials, etc.
    (
        genres,
        materials,
        movements,
        artists,
        locations,
        classes,
    ) = load_wd_entities.bundle_extract_subjects_calls(
        [
            GENRE[PLURAL],
            MATERIAL[PLURAL],
            MOVEMENT[PLURAL],
            ARTIST[PLURAL],
            LOCATION[PLURAL],
            CLASS[PLURAL],
        ],
        merged_artworks,
    )
    print("Total movements after transitive closure loading: ", len(movements))

    for subject, type_name in [
        (genres, GENRE[SINGULAR]),
        (materials, MATERIAL[SINGULAR]),
        (movements, MOVEMENT[SINGULAR]),
        (artists, ARTIST[SINGULAR]),
        (locations, LOCATION[SINGULAR]),
        (classes, CLASS[SINGULAR]),
    ]:
        for entity in subject:
            entity.update({TYPE: type_name})

    # Get distinct classes from artworks, motifs, etc.
    extracted_classes = load_wd_entities.get_distinct_extracted_classes(
        merged_artworks, motifs, genres, materials, movements, artists, locations, classes,
    )
    for c in extracted_classes:
        c.update({TYPE: CLASS[SINGULAR]})

    # Add the "subclass_of" parameter from the extracted_classes to the crawled classes
    existing_classes = []
    for class_itm in classes:
        extracted_class = [d for d in extracted_classes if class_itm[ID] in d.values()]
        if extracted_class:
            class_itm.update({SUBCLASS_OF: extracted_class[0][SUBCLASS_OF]})
        existing_classes.append(class_itm[ID])

    # append classes that are missing from our first list
    for class_itm in extracted_classes:
        if class_itm[ID] not in existing_classes:
            classes.append(class_itm)

    print("Total classes after transitive closure loading: ", len(classes))
    # Get country labels for merged artworks and locations
    (
        locations,
        merged_artworks,
        movements,
    ) = load_wd_entities.get_country_labels_for_merged_artworks_and_locations(
        locations, merged_artworks, movements
    )

    # Get labels for artists
    artists = load_wd_entities.get_labels_for_artists(
        artists, [GENDER, PLACE_OF_BIRTH, PLACE_OF_DEATH, CITIZENSHIP]
    )

    # Get unit symbols from qid for artworks
    distinct_unit_qids = load_wd_entities.get_distinct_unit_symbol_qids(merged_artworks)
    unit_symbols = load_wd_entities.get_unit_symbols(distinct_unit_qids)
    load_wd_entities.resolve_unit_id_to_unit_symbol(merged_artworks, unit_symbols)

    # Get exhibition histories as subdict
    merged_artworks = load_wd_entities.resolve_exhibition_ids_to_exhibition_entities(
        merged_artworks
    )

    # Significant events as subdict
    merged_artworks = load_wd_entities.resolve_significant_event_id_entities_to_labels(
        merged_artworks
    )

    # Write to JSON
    write_data_to_json_and_csv(
        motifs,
        genres,
        extracted_classes,
        materials,
        movements,
        locations,
        merged_artworks,
        artists,
        classes,
    )
    write_state(ETL_STATES.GET_WIKIDATA_ITEMS.MERGED_ARTWORKS)
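
generate_csv and generate_json are called here but defined elsewhere in the project. A minimal sketch of what the JSON variant might look like, following the json.dump pattern used throughout this listing; the directory handling is an assumption, and the CSV variant would additionally need the field list returned by get_fields.

import json
from pathlib import Path
from typing import Dict, List


def generate_json(entities: List[Dict], path_name: Path) -> None:
    # Minimal sketch, not the project's implementation: dump the extracted
    # entities as a UTF-8 JSON array, creating parent directories as needed.
    output_file = Path(path_name).with_suffix(".json")
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, "w", newline="", encoding="utf-8") as file:
        json.dump(entities, file, ensure_ascii=False)
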
Example #7
        classes,
    )
    write_state(ETL_STATES.GET_WIKIDATA_ITEMS.MERGED_ARTWORKS)


if __name__ == "__main__":
    if len(sys.argv) > 1:
        print(sys.argv)
        dev_count_set = False
        if "-d" in sys.argv:
            # Only treat the token right after -d as the chunk limit (mirrors the -t handling below)
            if len(sys.argv) > sys.argv.index('-d') + 1 and sys.argv[sys.argv.index('-d') + 1].isdigit():
                DEV_CHUNK_LIMIT = int(sys.argv[sys.argv.index('-d') + 1])
                dev_count_set = True
            print("DEV MODE: on, DEV_LIM={0}".format(DEV_CHUNK_LIMIT))
            DEV = True
        if "-r" in sys.argv:
            RECOVER_MODE = True
        if "-t" in sys.argv:
            if len(sys.argv) > sys.argv.index('-t') + 1 and sys.argv[sys.argv.index('-t') + 1].isdigit():
                CLASS_LIM = int(sys.argv[sys.argv.index('-t') + 1])
            print("TEST MODE: on, CLASS_LIM={0}".format(CLASS_LIM))
            TEST_MODE = True
            DEV = True
            if not dev_count_set:
                DEV_CHUNK_LIMIT = 3
    if RECOVER_MODE and check_state(ETL_STATES.GET_WIKIDATA_ITEMS.STATE):
        exit(0)
    logger.info("Extracting Art Ontology")
    extract_art_ontology()
    write_state(ETL_STATES.GET_WIKIDATA_ITEMS.STATE)
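
The block above parses -d, -r and -t by indexing sys.argv directly. Purely as a point of comparison, and not the project's code, the same flags could be declared with argparse from the standard library; the help texts and fallback constants are assumptions.

import argparse

parser = argparse.ArgumentParser(description="Extract the art ontology from Wikidata")
parser.add_argument("-d", dest="dev_chunk_limit", type=int, nargs="?", const=2,
                    help="dev mode, optionally with a chunk limit per query")
parser.add_argument("-r", dest="recover_mode", action="store_true",
                    help="recover mode: skip steps whose ETL state was already written")
parser.add_argument("-t", dest="class_limit", type=int, nargs="?", const=1,
                    help="test mode, optionally limiting the crawled source types")
args = parser.parse_args()

RECOVER_MODE = args.recover_mode
DEV = args.dev_chunk_limit is not None or args.class_limit is not None
TEST_MODE = args.class_limit is not None
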
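Example #8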
    entries_added_count = 0
    for entity in entities:
        if entity[ID] in videos:
            entries_added_count += 1
            entity[VIDEOS] = videos[entity[ID]]

    print("Added videos for {} entries. Saving the file..".format(
        entries_added_count))

    return entities


if __name__ == "__main__":
    if len(sys.argv) > 1 and "-r" in sys.argv:
        RECOVER_MODE = True
    if RECOVER_MODE and check_state(
            ETL_STATES.DATA_TRANSFORMATION.ADD_YOUTUBE_VIDEOS):
        exit(0)
    check = "-c" in sys.argv
    for entity_type in [ARTWORK[PLURAL], ARTIST[PLURAL], MOVEMENT[PLURAL]]:
        print(
            datetime.datetime.now(),
            f"Starting with adding youtube videos for file: {entity_type}",
        )
        try:
            # Open file
            with open((create_new_path(entity_type)).with_suffix(f".{JSON}"),
                      encoding="utf-8") as file:
                items = json.load(file)

            entities = add_youtube_videos(items, check_ids=check)
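Example #9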
    return calc_relative_rank(subjects)


def calc_relative_rank(entities: List[Dict]) -> List[Dict]:
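    """Sort the entities by their absolute rank and derive a relative rank in [0, 1) from each entity's position."""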
    sorted_entities = sorted(entities, key=lambda k: k[ABSOLUTE_RANK])
    for i, entity in enumerate(sorted_entities):
        entity.update({RELATIVE_RANK: float(i / len(entities))})

    return sorted_entities


if __name__ == "__main__":
    if len(sys.argv) > 1 and "-r" in sys.argv:
        RECOVER_MODE = True
    if RECOVER_MODE and check_state(ETL_STATES.DATA_TRANSFORMATION.RANKING):
        exit(0)
    artworks = []
    for filename in [
            ARTWORK[PLURAL],  # Artworks have to be first, otherwise the ranking doesn't work
            MOTIF[PLURAL],  # Main subjects are not considered
            GENRE[PLURAL],
            MATERIAL[PLURAL],
            MOVEMENT[PLURAL],
            ARTIST[PLURAL],
            LOCATION[PLURAL],
            CLASS[PLURAL],
    ]:
        print(
            datetime.datetime.now(),