from pathlib import Path
from typing import Dict, List, Optional

from data_extraction.constants import *
from data_extraction.request_utils import send_http_request
from shared.constants import JSON
from shared.utils import (
    chunks,
    create_new_path,
    language_config_to_list,
    setup_logger,
    check_state,
    write_state,
)

# Toggles resuming an interrupted run via the check_state/write_state helpers
RECOVER_MODE = False

logger = setup_logger(
    "data_extraction.get_wikipedia_extracts",
    Path(__file__).parent.parent.absolute()
    / "logs"
    / GET_WIKIPEDIA_EXTRACS_LOG_FILENAME,
)

# First element of every language-config entry is the language key (e.g. "en")
lang_keys = [item[0] for item in language_config_to_list()]


def get_wikipedia_page_ids(
    items: List[Dict],
    indices: List[int],
    langkey: str,
    timeout: Optional[int] = TIMEOUT,
    sleep_time: Optional[int] = SLEEP_TIME,
    maxlag: Optional[int] = MAX_LAG,
) -> Dict:
    """Get the Wikipedia page ids for the given items from the labels
    referenced in their sitelinks

    https://en.wikipedia.org/w/api.php?action=help&modules=query

    sitelink de: Mona_Lisa is resolved to
    """
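
# --- Illustrative sketch, not part of the original sources ------------------
# The docstring above references the MediaWiki "query" module: a sitelink
# title such as de:Mona_Lisa is resolved to a numeric Wikipedia page id.
# A minimal standalone version of that lookup might look as follows. The
# helper name and the direct `requests` call are assumptions for
# illustration; the real function presumably batches items with chunks()
# and routes requests through send_http_request, honoring the
# timeout/sleep_time/maxlag parameters above.
import requests


def lookup_page_ids(titles: List[str], langkey: str) -> Dict[str, int]:
    """Resolve Wikipedia page titles to page ids for one language edition."""
    response = requests.get(
        f"https://{langkey}.wikipedia.org/w/api.php",
        params={
            "action": "query",
            "titles": "|".join(titles),  # the API accepts up to 50 titles
            "format": "json",
        },
        timeout=5,
    ).json()
    # "pages" is keyed by the page id as a string; unknown titles map to "-1"
    return {
        page["title"]: int(page_id)
        for page_id, page in response["query"]["pages"].items()
    }


# Example: lookup_page_ids(["Mona_Lisa"], "de") returns {"Mona Lisa": <page id>}
# (the API normalizes underscores in titles to spaces)
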
from typing import List

from shared.constants import (
    CITIZENSHIP,
    COUNTRY,
    CRAWLER_OUTPUT,
    DESCRIPTION,
    GENDER,
    LABEL,
    SINGULAR,
    WIKIPEDIA_LINK,
    PLACE_OF_BIRTH,
    PLACE_OF_DEATH,
    EXHIBITION_HISTORY,
    SIGNIFICANT_EVENT,
    JSON,
)
from shared.utils import generate_json, language_config_to_list

language_values = language_config_to_list()
# First element of every language-config entry is the language key (e.g. "en")
language_keys = [item[0] for item in language_values]


def get_language_attributes() -> List[str]:
    """Return all attributes in the crawler .csv/.json files that need language handling

    Returns:
        List containing all language attributes
    """
    return [
        LABEL[SINGULAR],
        DESCRIPTION[SINGULAR],
        GENDER,
        CITIZENSHIP,
        COUNTRY,
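        # Illustrative usage sketch, not from the original file: downstream
        # steps typically cross this attribute list with language_keys to
        # address per-language fields; the "attribute_language" suffix scheme
        # shown here is an assumption for illustration.
        #
        #   localized = [
        #       f"{attribute}_{lang}"
        #       for attribute in get_language_attributes()
        #       for lang in language_keys
        #   ]
        #   # e.g. label_en, label_de, ... for every configured language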