def load_geonames(filename: str) -> None:
    """Load geonames data for keys in json file.

    Args:
        filename: Filename to load
    """
    # Load the placenames translation file; drop the schema key so it is
    # not treated as a placename entry.
    with open(filename, encoding="utf-8") as file:
        placenames = json.load(file)
    del placenames["$schema"]
    placenames = cast(TranslationDictCleanedPlacenames, placenames)

    # Resolve each English name to a geonames id and wikipedia link.
    for entry in placenames.values():
        print(f"Looking for data for {entry['en_GB']}")
        match = geocoder.geonames(entry["en_GB"], key="danielnoord")
        details = geocoder.geonames(
            match.geonames_id, key="danielnoord", method="details"
        )
        entry["geonames_id"] = match.geonames_id
        # Geonames returns wikipedia links without a scheme.
        entry["geonames_wikipedia"] = (
            f"https://{details.wikipedia}" if details.wikipedia else None
        )

    # Re-add schema
    # NOTE(review): schema path "../static" vs output dir "outputs/Translations"
    # looks inconsistent with update_placenames_with_geonames — confirm.
    placenames["$schema"] = "../static/JSON/Placenames.json"
    write_single_json_file(placenames, "outputs/Translations", "Placenames.json")
def search_isni_api(database: IndividualsDictCleaned) -> None:
    """Checks name and surname pairs and sees if they match with ISNI identifiers."""

    def _field_text(record, tag):
        # Text of the first <tag> element, or None when the tag is absent.
        try:
            return list(record.iter(tag))[0].text
        except IndexError:
            return None

    record_tag = "{http://www.loc.gov/zing/srw/}record"
    for data in database.values():
        # Skip entries that already carry an ISNI identifier.
        if data.get("ISNI:id", None):
            continue
        name = f"{data['name']} {data['surname']}".replace(" ", "+")
        response = requests.get(
            f"http://isni.oclc.org/sru/?query=pica.nw+%3D+%22{name}%22&operation=searchRetrieve&recordSchema=isni-b&maximumRecords=10"  # pylint: disable=line-too-long
        )
        records = list(ElementTree.fromstring(response.content).iter(record_tag))
        if not records:
            continue
        print("\n", data["name"], data["surname"])
        for record in records:
            # No fallback for the URI: a record without one is unexpected,
            # so the IndexError is allowed to surface.
            uri = list(record.iter("isniURI"))[0].text
            forename = _field_text(record, "forename")
            surname = _field_text(record, "surname")
            print(forename, surname)
            print(uri)

    database[
        "$schema"] = "../static/JSON/Individuals.json"  # type: ignore[assignment]
    write_single_json_file(database, "outputs", "Individuals.json")
def sort_database(filename: str) -> None:
    """Sorts the entries in a database.

    Args:
        filename: File name of initial database
    """
    with open(filename, encoding="utf-8") as file:
        persons = json.load(file)
    del persons["$schema"]

    def _sort_field(identifier, data, field, key=None):
        # Sort a list field in place (in `persons`) only when it is both
        # longer than one element and not already in sorted order.
        entries = data[field]
        if len(entries) > 1 and entries != sorted(entries, key=key):
            persons[identifier][field] = sorted(entries, key=key)
            print(f"Sorted {field} for {identifier}")

    def _pair_key(x):
        # Sort None second elements first, then by the element itself.
        return (x[1] is not None, x[1])

    for identifier, data in persons.items():
        _sort_field(identifier, data, "sources")
        _sort_field(identifier, data, "sources_other")
        for field in ("titles", "functions"):
            try:
                _sort_field(identifier, data, field, _pair_key)
            except IndexError as error:
                raise IndexError(
                    f"Something wrong with the {field} of {identifier}. Error: {error}"
                ) from error

    persons["$schema"] = "../static/JSON/Individuals.json"
    write_single_json_file(persons, "outputs", "Individuals.json")
def update_placenames_with_geonames(filename: str) -> None:
    """Pull data from geonames and populate our database with it."""
    # Load file and drop the schema key so it is not treated as an entry.
    with open(filename, encoding="utf-8") as file:
        placenames = json.load(file)
    del placenames["$schema"]
    placenames = cast(TranslationDictCleanedPlacenames, placenames)

    allowed_classes = {"P", "T", "H", "S"}  # Places, islands, seas or estates
    for entry in placenames.values():
        if entry["geonames_id"] is None:
            raise ValueError(f"{entry['en_GB']} doesn't have a Geonames ID")
        record = geocoder.geonames(
            entry["geonames_id"], method="details", key="danielnoord"
        )
        if record.feature_class not in allowed_classes:
            raise ValueError(
                f"""Geonames ID for {entry['en_GB']} is not a place, island or sea.
                Please check https://www.geonames.org/{entry['geonames_id']}"""
            )
        # Populate fields
        entry["latitude"] = record.lat
        entry["longitude"] = record.lng
        # Geonames returns wikipedia links without a scheme.
        entry["geonames_wikipedia"] = (
            f"https://{record.wikipedia}" if record.wikipedia else None
        )

    # Re-add schema
    placenames["$schema"] = "../../static/JSON/Placenames.json"
    write_single_json_file(placenames, "outputs", "Placenames.json")
def check_all_sources(filename: str) -> None:
    """Check and update all sources for given database.

    Args:
        filename: File name of initial database
    """
    with open(filename, encoding="utf-8") as file:
        persons = json.load(file)
    del persons["$schema"]

    # Collect all known source patterns; the input file groups them by type.
    source_patterns: list[str] = []
    with open("inputs/SourcePatterns.json", encoding="utf-8") as file:
        source_types = json.load(file)
    for sources in source_types.values():
        source_patterns += sources

    count_todo = 0
    probably_wrong: list[str] = []
    # Anchor every pattern to the end of the string it is matched against.
    compiled_source_patterns = [re.compile(f"{i}$") for i in source_patterns]
    used_patterns: set[re.Pattern[str]] = set()

    # Both source list fields go through the same check; the helper threads
    # its accumulators (used patterns, todo count, suspect list) back out.
    for identifier, data in persons.items():
        for field in ("sources", "sources_other"):
            (
                data[field],
                used_patterns,
                count_todo,
                probably_wrong,
            ) = check_sources_entry(
                data[field],
                compiled_source_patterns,
                used_patterns,
                identifier,
                count_todo,
                probably_wrong,
            )

    persons["$schema"] = "../static/JSON/Individuals.json"

    # Write new file if this file itself is run
    # NOTE(review): __name__ here is the defining module's name, so the file
    # is only written when this module is executed as a script — confirm
    # that is the intended behavior for importing callers.
    if __name__ == "__main__":
        write_single_json_file(persons, "outputs", "Individuals.json")

    if probably_wrong:
        print("\nThese sources might be wrong")
        print(
            "They have not been added to the list in python/json_check_sources.py"
        )
        print(
            r"However, that list is awful anyway and is in dire need of updating :')"
        )
        for i in probably_wrong:
            print("", i)

    # Report patterns that never matched anything (PEP 8: `not in`).
    if unused_patterns := [
        i for i in compiled_source_patterns if i not in used_patterns
    ]:
        print(
            f"Found the following unused source patterns:\n {unused_patterns}")
def convert_wikidata_to_isni(database: IndividualsDictCleaned) -> None:
    """Checks wikidata identifiers and sees if they can be converted to ISNI identifiers."""
    for data in database.values():
        # Only look up entries that have a wikidata id but no ISNI id yet.
        if not data.get("ISNI:id", None) and data.get("wikidata:id", None):
            wikidata = wdi_core.WDItemEngine(
                wd_item_id=data["wikidata:id"]).get_wd_json_representation()
            # P213 is the wikidata property holding the ISNI identifier.
            if isni_data := wikidata["claims"].get("P213", None):
                data["ISNI:id"] = isni_data[0]["mainsnak"]["datavalue"][
                    "value"]
            else:
                # Mark as looked-up-but-missing so it is not retried.
                data["ISNI:id"] = None
    database[
        "$schema"] = "../static/JSON/Individuals.json"  # type: ignore[assignment]
    write_single_json_file(database, "outputs", "Individuals.json")


# NOTE(review): this second definition of search_isni_api appears truncated —
# it builds `records` but never uses it, and a fuller copy of the same
# function exists elsewhere in this file (later definitions shadow earlier
# ones at import time). Confirm which copy is canonical.
def search_isni_api(database: IndividualsDictCleaned) -> None:
    """Checks name and surname pairs and sees if they match with ISNI identifiers."""
    for data in database.values():
        # Only query the ISNI SRU endpoint for entries without an ISNI id.
        if not data.get("ISNI:id", None):
            name = f"{data['name']} {data['surname']}".replace(" ", "+")
            response = requests.get(
                f"http://isni.oclc.org/sru/?query=pica.nw+%3D+%22{name}%22&operation=searchRetrieve&recordSchema=isni-b&maximumRecords=10"  # pylint: disable=line-too-long
            )
            # Extract all SRW <record> elements from the XML response.
            records = list(
                ElementTree.fromstring(response.content).iter(
                    "{http://www.loc.gov/zing/srw/}record"))
def save_database(  # pylint: disable=too-many-locals
        filename: str,
        previous_database: Optional[IndividualsDict] = None) -> None:
    """Load database from .docx and write .json.

    Args:
        filename: Filename of the input file
        previous_database: Dict with data from previous database
    """
    doc = docx.Document(filename)
    all_individuals: IndividualsDict = {}

    # TODO: This does not work currently and removes fields we are using
    for para in doc.paragraphs:
        # Each paragraph holds one person; fields are separated by
        # "\n<label>: " runs, so splitting yields the values in order.
        (
            identifier,
            person_type,
            surname,
            name,
            date_of_birth,
            place_of_birth,
            date_of_death,
            place_of_death,
            titles,
            functions,
            comment,
            comment_daniel,
            sources,
            images,
        ) = re.split(r"\n.*?: ", para.text)

        all_individuals[identifier] = {  # type: ignore[assignment]
            "surname": surname,
            "person_type": int(person_type),
            "name": name,
            "date_of_birth": date_of_birth,
            "place_of_birth": place_of_birth,
            "date_of_death": date_of_death,
            "place_of_death": place_of_death,
            "titles": parse_title(titles),
            "functions": parse_function(functions),
            "comment": comment,
            "comment_daniel": comment_daniel,
            "sources": sources.replace("\n", "").split("| "),
            "images": images.replace("\n", "").split("| "),
        }

    # New entries take precedence over the previous database.
    if previous_database:
        all_individuals = previous_database | all_individuals

    # Sort and Schema, shouldn't sort a dict but oh well..
    ordered = dict(sorted(all_individuals.items(), key=lambda item: item[0]))
    all_individuals = {"$schema": "../static/JSON/Individuals.json"} | ordered
    write_single_json_file(all_individuals, "outputs", "Individuals.json")