def add_wikipedia_extracts(
    language_keys: Optional[List[str]] = lang_keys,
) -> None:
    """Add the wikipedia extracts to the already existing files

    Args:
        language_keys: Language keys to extract wikipedia abstracts for. Defaults to languageconfig.csv
    """
    for filename in [
        ARTWORK[PLURAL],
        MOTIF[PLURAL],
        GENRE[PLURAL],
        MATERIAL[PLURAL],
        MOVEMENT[PLURAL],
        ARTIST[PLURAL],
        LOCATION[PLURAL],
        CLASS[PLURAL],
    ]:
        print(
            datetime.datetime.now(),
            "Starting extracting wikipedia extracts with",
            filename,
        )
        try:
            with open(
                (create_new_path(filename)).with_suffix(f".{JSON}"), encoding="utf-8"
            ) as file:
                if RECOVER_MODE and check_state(
                    ETL_STATES.GET_WIKIPEDIA_EXTRACTS.EXTRACT_ABSTRACTS + filename
                ):
                    continue
                items = json.load(file)
                for key in language_keys:
                    item_indices_with_wiki_link_for_lang = [
                        items.index(item)
                        for item in items
                        if item[f"{WIKIPEDIA_LINK}_{key}"] != ""
                    ]
                    print(
                        f"There are {len(item_indices_with_wiki_link_for_lang)} {key}.wikipedia links within the {len(items)} {filename} items"
                    )

                    # Retry the operation until it is done
                    done = False
                    # ToDo: The limit for extracts seems to be 20. There is an excontinue parameter which
                    # could be used to increase the performance and load more at once (50 is allowed by the API) if needed.
                    # The request method has to be adjusted for this.
                    # Further information: https://stackoverflow.com/questions/9846795/prop-extracts-not-returning-all-extracts-in-the-wikimedia-api
                    chunk_size = 20
                    while not done:
                        try:
                            item_indices_chunks = chunks(
                                item_indices_with_wiki_link_for_lang, chunk_size
                            )
                            extracted_count = 0
                            # Fill JSON objects without a wiki link to an abstract with empty key-value pairs
                            # (could be removed if the frontend is adjusted)
                            for j in range(len(items)):
                                if j not in item_indices_with_wiki_link_for_lang:
                                    items[j][f"{ABSTRACT}_{key}"] = ""

                            for chunk in item_indices_chunks:
                                # Get page ids from URL, e.g. https://en.wikipedia.org/w/api.php?action=query&titles=Jean_Wauquelin_presenting_his_'Chroniques_de_Hainaut'_to_Philip_the_Good
                                page_id_indices_dictionary = get_wikipedia_page_ids(
                                    items, chunk, key
                                )
                                # Get extracts from page ids, e.g. https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&exintro&explaintext&pageids=70889|1115370
                                raw_response = get_wikipedia_extracts(
                                    items, page_id_indices_dictionary, key
                                )
                                # Add the extracted abstracts to the JSON objects
                                for i in chunk:
                                    items[i][f"{ABSTRACT}_{key}"] = raw_response[i]

                                extracted_count += len(chunk)
                                print(
                                    f"Extracts for {filename} and language {key} status: {extracted_count}/{len(item_indices_with_wiki_link_for_lang)}",
                                    end="\r",
                                    flush=True,
                                )

                            # If a chunk is finished and the chunk size is < 20 (e.g. the previous chunk failed
                            # but the current one succeeded): increase the chunk size
                            chunk_size = chunk_size + 5 if chunk_size < 20 else chunk_size
                            # Set done to True after all items have been processed
                            done = True
                        except Exception as error:
                            logger.error(
                                f"Fetching wikipedia extracts for {filename}, lang: {key} and chunk size: {chunk_size} failed!"
                            )
                            logger.error(error)
                            # Lower the chunk size and try again in the while loop
                            chunk_size -= 5
                            if chunk_size > 0:
                                logger.info(
                                    f"Trying the wikipedia extracts again with chunk size: {chunk_size}"
                                )
                                continue
                            else:
                                logger.exception(error)
                                raise error

            # Overwrite the file
            with open(
                (create_new_path(filename)).with_suffix(f".{JSON}"),
                "w",
                newline="",
                encoding="utf-8",
            ) as file:
                json.dump(items, file, ensure_ascii=False)
            write_state(ETL_STATES.GET_WIKIPEDIA_EXTRACTS.EXTRACT_ABSTRACTS + filename)
        except Exception as error:
            print(
                f"Error when opening following file: {filename}. Error: {error}. Skipping file now."
            )
            continue
        print(
            datetime.datetime.now(),
            "Finished extracting wikipedia extracts with",
            filename,
        )


if __name__ == "__main__":
    if len(sys.argv) > 1 and "-r" in sys.argv:
        RECOVER_MODE = True
    if RECOVER_MODE and check_state(ETL_STATES.GET_WIKIPEDIA_EXTRACTS.STATE):
        exit(0)
    logger.info("Extracting Wikipedia Abstracts")
    add_wikipedia_extracts()
    write_state(ETL_STATES.GET_WIKIPEDIA_EXTRACTS.STATE)
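# The chunks() helper called above is not part of this excerpt. A minimal sketch of
# such a helper, assuming it simply yields successive slices of the index list
# (the actual implementation in the pipeline may differ):
from typing import Iterator, List


def chunks(indices: List[int], chunk_size: int) -> Iterator[List[int]]:
    """Yield successive chunk_size-sized slices from indices."""
    for start in range(0, len(indices), chunk_size):
        yield indices[start:start + chunk_size]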
                    and inceptions[movement["id"]]["start"] < movement["start_time"]):
                movement["start_time_est"] = inceptions[movement["id"]]["start"]
            else:
                movement["start_time_est"] = ""
            if (not movement["end_time"]
                    and (not movement["start_time"]
                         or inceptions[movement["id"]]["end"] > movement["start_time"])
                ) or (movement["end_time"]
                      and inceptions[movement["id"]]["end"] > movement["end_time"]):
                movement["end_time_est"] = inceptions[movement["id"]]["end"]
            else:
                movement["end_time_est"] = ""
        else:
            movement["start_time_est"] = ""
            movement["end_time_est"] = ""
        movements_modified.append(movement)

    with open(movements_output_file, "w", newline="", encoding="utf-8") as file:
        json.dump(movements_modified, file, ensure_ascii=False, cls=DecimalEncoder)
    write_state(ETL_STATES.DATA_TRANSFORMATION.ESTIMATE_MOVEMENT_PERIOD)
    # print('took ', datetime.now() - start)
    return movements


if __name__ == "__main__":
    if len(sys.argv) > 1 and "-r" in sys.argv:
        RECOVER_MODE = True
    if RECOVER_MODE and check_state(
        ETL_STATES.DATA_TRANSFORMATION.HAS_PART_PART_OF_ENHANCEMENT
    ):
        exit(0)
    print(
        "Starting part of, has part enhancement on movements",
        datetime.datetime.now(),
    )
    movements_file = (
        Path(__file__).resolve().parent.parent
        / "crawler_output"
        / "intermediate_files"
        / "json"
        / "movements.json"
    )
    with open(movements_file, encoding="utf-8") as file:
        movements = json.load(file)
    movements = inverse_attribute_enhancement(HAS_PART, PART_OF, movements)
    movements = inverse_attribute_enhancement(PART_OF, HAS_PART, movements)
    with open(movements_file, "w", newline="", encoding="utf-8") as file:
        file.write(json.dumps(movements, ensure_ascii=False))
    print(
        "Finished part of, has part enhancement on movements",
        datetime.datetime.now(),
    )
    write_state(ETL_STATES.DATA_TRANSFORMATION.HAS_PART_PART_OF_ENHANCEMENT)
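# inverse_attribute_enhancement() is defined elsewhere in this script. Based only on how
# it is called above (once per direction, with the attribute names swapped), a plausible
# sketch of symmetrizing the two inverse relations could look like this. This is an
# illustrative assumption, not the pipeline's actual implementation; it assumes the
# attributes hold lists of movement ids:
def inverse_attribute_enhancement(attribute, inverse_attribute, movements):
    """If movement A lists B under `attribute`, ensure B lists A under `inverse_attribute`."""
    movements_by_id = {movement["id"]: movement for movement in movements}
    for movement in movements:
        for target_id in movement.get(attribute, []):
            target = movements_by_id.get(target_id)
            if target is None:
                continue
            inverse_values = target.setdefault(inverse_attribute, [])
            if movement["id"] not in inverse_values:
                inverse_values.append(movement["id"])
    return movements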
"Starting with splitting art_ontology.json to its language files", ) for lang_key in language_keys: with open(art_ontology_file, encoding="utf-8") as json_file: art_ontology = ijson.items(json_file, 'item') art_ontology_for_lang = [] print(f"Start generating art_ontology_{lang_key}.{JSON}") for item in art_ontology: if is_jsonable(item): art_ontology_for_lang.append( modify_langdict(item, lang_key)) art_ontology_for_lang = remove_language_key_attributes_in_exhibitions( art_ontology_for_lang) generate_json( art_ontology_for_lang, Path(__file__).resolve().parent.parent / CRAWLER_OUTPUT / f"art_ontology_{lang_key}", ) json_file.close() print(f"Finished generating art_ontology_{lang_key}.{JSON}") print( datetime.datetime.now(), "Finished with splitting art_ontology.json to its language files", ) write_state(ETL_STATES.DATA_TRANSFORMATION.SPLIT_LANGUAGES, Path(__file__).parent.parent)
def extract_art_ontology() -> None:
    """Extracts *.csv and *.json files for artworks and subjects (e.g. motifs, movements) from wikidata"""

    # Set of already crawled wikidata items
    already_crawled_wikidata_items = set(BLOCKLIST)

    for source in SOURCE_TYPES if not TEST_MODE else SOURCE_TYPES[:CLASS_LIM]:
        if RECOVER_MODE and check_state(
            ETL_STATES.GET_WIKIDATA_ITEMS.EXTRACT_SOURCE + source[PLURAL]
        ):
            continue
        extracted_artwork = load_wd_entities.extract_artworks(
            source[PLURAL], source[ID], already_crawled_wikidata_items, DEV, DEV_CHUNK_LIMIT
        )

        path_name = create_new_path(ARTWORK[PLURAL], source[PLURAL], CSV)
        generate_csv(extracted_artwork, get_fields(source[PLURAL]), path_name)

        path_name = create_new_path(ARTWORK[PLURAL], source[PLURAL], JSON)
        generate_json(extracted_artwork, path_name)
        write_state(ETL_STATES.GET_WIKIDATA_ITEMS.EXTRACT_SOURCE + source[PLURAL])

    if RECOVER_MODE and check_state(ETL_STATES.GET_WIKIDATA_ITEMS.MERGED_ARTWORKS):
        return
    merged_artworks = merge_artworks()

    path_name = create_new_path(ARTWORK[PLURAL], file_type=CSV)

    # Get motifs and main subjects
    motifs = load_wd_entities.extract_motifs_and_main_subjects(merged_artworks)
    for motif in motifs:
        motif.update({TYPE: MOTIF[SINGULAR]})

    # Get extracted genres, materials, etc.
    (
        genres,
        materials,
        movements,
        artists,
        locations,
        classes,
    ) = load_wd_entities.bundle_extract_subjects_calls(
        [
            GENRE[PLURAL],
            MATERIAL[PLURAL],
            MOVEMENT[PLURAL],
            ARTIST[PLURAL],
            LOCATION[PLURAL],
            CLASS[PLURAL],
        ],
        merged_artworks,
    )

    print("Total movements after transitive closure loading: ", len(movements))

    for subject, type_name in [
        (genres, GENRE[SINGULAR]),
        (materials, MATERIAL[SINGULAR]),
        (movements, MOVEMENT[SINGULAR]),
        (artists, ARTIST[SINGULAR]),
        (locations, LOCATION[SINGULAR]),
        (classes, CLASS[SINGULAR]),
    ]:
        for entity in subject:
            entity.update({TYPE: type_name})

    # Get distinct classes from artworks, motifs, etc.
    extracted_classes = load_wd_entities.get_distinct_extracted_classes(
        merged_artworks,
        motifs,
        genres,
        materials,
        movements,
        artists,
        locations,
        classes,
    )
    for extracted_class in extracted_classes:
        extracted_class.update({TYPE: CLASS[SINGULAR]})

    # Add the "subclass_of" attribute from the extracted_classes to the crawled classes
    existing_classes = []
    for class_itm in classes:
        extracted_class = [
            d for d in extracted_classes if class_itm[ID] in d.values()
        ]
        if len(extracted_class) > 0:
            class_itm.update({SUBCLASS_OF: extracted_class[0][SUBCLASS_OF]})
        existing_classes.append(class_itm[ID])

    # Append classes that are missing from our first list
    for class_itm in extracted_classes:
        if class_itm[ID] not in existing_classes:
            classes.append(class_itm)

    print("Total classes after transitive closure loading: ", len(classes))

    # Get country labels for merged artworks and locations
    (
        locations,
        merged_artworks,
        movements,
    ) = load_wd_entities.get_country_labels_for_merged_artworks_and_locations(
        locations, merged_artworks, movements
    )

    # Get labels for artists
    artists = load_wd_entities.get_labels_for_artists(
        artists, [GENDER, PLACE_OF_BIRTH, PLACE_OF_DEATH, CITIZENSHIP]
    )

    # Get unit symbols from qid for artworks
    distinct_unit_qids = load_wd_entities.get_distinct_unit_symbol_qids(merged_artworks)
    unit_symbols = load_wd_entities.get_unit_symbols(distinct_unit_qids)
    load_wd_entities.resolve_unit_id_to_unit_symbol(merged_artworks, unit_symbols)

    # Get exhibition histories as subdict
    merged_artworks = load_wd_entities.resolve_exhibition_ids_to_exhibition_entities(
        merged_artworks
    )

    # Significant events as subdict
    merged_artworks = load_wd_entities.resolve_significant_event_id_entities_to_labels(
        merged_artworks
    )

    # Write to JSON and CSV
    write_data_to_json_and_csv(
        motifs,
        genres,
        extracted_classes,
        materials,
        movements,
        locations,
        merged_artworks,
        artists,
        classes,
    )
    write_state(ETL_STATES.GET_WIKIDATA_ITEMS.MERGED_ARTWORKS)


if __name__ == "__main__":
    if len(sys.argv) > 1:
        print(sys.argv)
        dev_count_set = False
        if "-d" in sys.argv:
            if (
                len(sys.argv) > sys.argv.index("-d") + 1
                and sys.argv[sys.argv.index("-d") + 1].isdigit()
            ):
                DEV_CHUNK_LIMIT = int(sys.argv[sys.argv.index("-d") + 1])
                dev_count_set = True
            print("DEV MODE: on, DEV_LIM={0}".format(DEV_CHUNK_LIMIT))
            DEV = True
        if "-r" in sys.argv:
            RECOVER_MODE = True
        if "-t" in sys.argv:
            if (
                len(sys.argv) > sys.argv.index("-t") + 1
                and sys.argv[sys.argv.index("-t") + 1].isdigit()
            ):
                CLASS_LIM = int(sys.argv[sys.argv.index("-t") + 1])
            print("TEST MODE: on, CLASS_LIM={0}".format(CLASS_LIM))
            TEST_MODE = True
            DEV = True
            if not dev_count_set:
                DEV_CHUNK_LIMIT = 3

    if RECOVER_MODE and check_state(ETL_STATES.GET_WIKIDATA_ITEMS.STATE):
        exit(0)
    logger.info("Extracting Art Ontology")
    extract_art_ontology()
    write_state(ETL_STATES.GET_WIKIDATA_ITEMS.STATE)
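# check_state() and write_state() are shared by all of these scripts but not shown in the
# excerpts. A minimal sketch of how such recovery markers could work, assuming completed
# state names are appended as lines to a plain-text file. The file name, location, and
# signatures below are assumptions, not the pipeline's actual choices:
from pathlib import Path


def _state_file(base_dir: Path) -> Path:
    return base_dir / "etl_states.log"  # hypothetical file name


def write_state(state: str, base_dir: Path = Path(__file__).resolve().parent) -> None:
    """Record a finished ETL step so a later run with -r can skip it."""
    with open(_state_file(base_dir), "a", encoding="utf-8") as file:
        file.write(state + "\n")


def check_state(state: str, base_dir: Path = Path(__file__).resolve().parent) -> bool:
    """Return True if the given ETL step was already recorded as finished."""
    state_file = _state_file(base_dir)
    return state_file.exists() and state in state_file.read_text(encoding="utf-8").splitlines()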
        datetime.datetime.now(),
        f"Starting with adding youtube videos for file: {entity_type}",
    )
    try:
        # Open the file
        with open(
            (create_new_path(entity_type)).with_suffix(f".{JSON}"), encoding="utf-8"
        ) as file:
            items = json.load(file)
            entities = add_youtube_videos(items, check_ids=check)

        # Overwrite the file
        with open(
            (create_new_path(entity_type)).with_suffix(f".{JSON}"),
            "w",
            newline="",
            encoding="utf-8",
        ) as file:
            json.dump(entities, file, ensure_ascii=False)
    except Exception as error:
        logger.error(
            f"Error when opening following file: {entity_type}. Skipping file now.\nError:"
        )
        logger.exception(error)
        continue
    print(
        datetime.datetime.now(),
        f"Finished adding youtube videos for file: {entity_type}",
    )

write_state(ETL_STATES.DATA_TRANSFORMATION.ADD_YOUTUBE_VIDEOS)
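# create_new_path() is used by nearly every script above but defined in shared utilities
# that are not part of these excerpts. Based on the explicit path built in the
# has-part/part-of script (crawler_output/intermediate_files/<format>/<name>), a sketch
# of what it could look like. Parameter names, defaults, and the mkdir call are
# assumptions, not the actual helper:
from pathlib import Path
from typing import Optional


def create_new_path(file_name: str, subpath: Optional[str] = None, file_type: str = "json") -> Path:
    """Build a path below crawler_output/intermediate_files for the given file name."""
    path = (
        Path(__file__).resolve().parent.parent
        / "crawler_output"
        / "intermediate_files"
        / file_type
        / (f"{file_name}/{subpath}" if subpath else file_name)
    )
    path.parent.mkdir(parents=True, exist_ok=True)  # assumed convenience, may not match the real helper
    return path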
    try:
        # Read in the file
        with open(
            (create_new_path(filename)).with_suffix(f".{JSON}"), encoding="utf-8"
        ) as file:
            if filename == ARTWORK[PLURAL]:
                artworks = out_file = rank_artworks(json.load(file))
            else:
                out_file = rank_subjects(filename, json.load(file), artworks)

        # Overwrite the file
        # TODO: if the merging is done with something other than the js script, then overwrite the current file
        with open(
            (create_new_path(filename)).with_suffix(f".{JSON}"),
            "w",
            newline="",
            encoding="utf-8",
        ) as file:
            json.dump(out_file, file, ensure_ascii=False)
        print(
            datetime.datetime.now(),
            "Finished ranking with",
            filename,
        )
    except Exception as error:
        logger.error(
            f"Error when opening following file: {filename}. Skipping file now.\nError:"
        )
        logger.exception(error)
        continue

write_state(ETL_STATES.DATA_TRANSFORMATION.RANKING)
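# rank_artworks() and rank_subjects() are defined elsewhere in this module. The call
# pattern above (artworks are ranked first, then handed to rank_subjects) suggests that
# subject ranking is derived from the artworks referencing them. The sketch below is
# purely illustrative: the counting criterion and the "rank" field name are assumptions,
# not the pipeline's actual ranking logic.
def rank_subjects_by_artwork_references(filename, subjects, artworks):
    """Toy ranking: order subjects by how many artworks list their id under `filename`."""
    reference_counts = {}
    for artwork in artworks:
        for subject_id in artwork.get(filename, []):
            reference_counts[subject_id] = reference_counts.get(subject_id, 0) + 1
    for subject in subjects:
        subject["rank"] = reference_counts.get(subject["id"], 0)  # hypothetical field name
    return sorted(subjects, key=lambda subject: subject["rank"], reverse=True)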