예제 #1
0
from data_extraction.constants import *
from data_extraction.request_utils import send_http_request
from shared.constants import JSON
from shared.utils import chunks, create_new_path, language_config_to_list, setup_logger, check_state, write_state

# When True, the extraction resumes from a previously persisted state
# (see check_state/write_state imported above) instead of starting fresh.
RECOVER_MODE = False

# Module logger writing to <repo-root>/logs.
# NOTE(review): GET_WIKIPEDIA_EXTRACS_LOG_FILENAME looks like a typo of
# "EXTRACTS" — the constant is declared in data_extraction.constants,
# so it can only be renamed there together with all its users.
logger = setup_logger(
    "data_extraction.get_wikipedia_extracts",
    Path(__file__).parent.parent.absolute() / "logs" / GET_WIKIPEDIA_EXTRACS_LOG_FILENAME,
)

# Each language-config entry is a sequence whose first element is the
# language key; collect just the keys for iteration below.
lang_keys = [config_entry[0] for config_entry in language_config_to_list()]


def get_wikipedia_page_ids(
        items: List[Dict],
        indices: List[int],
        langkey: str,
        timeout: Optional[int] = TIMEOUT,
        sleep_time: Optional[int] = SLEEP_TIME,
        maxlag: Optional[int] = MAX_LAG,
) -> Dict:
    """Function to get the wikipedia page ids from their label referenced in the sitelinks

    https://en.wikipedia.org/w/api.php?action=help&modules=query
    sitelink de: Mona_Lisa is resolved to
예제 #2
0
    COUNTRY,
    CRAWLER_OUTPUT,
    DESCRIPTION,
    GENDER,
    LABEL,
    SINGULAR,
    WIKIPEDIA_LINK,
    PLACE_OF_BIRTH,
    PLACE_OF_DEATH,
    EXHIBITION_HISTORY,
    SIGNIFICANT_EVENT,
    JSON,
)
from shared.utils import generate_json, language_config_to_list

# Load the language configuration once at import time; every entry is a
# sequence whose first element is the language key (used throughout this module).
language_values = language_config_to_list()
language_keys = [entry[0] for entry in language_values]


def get_language_attributes() -> List[str]:
    """Returns all attributes in crawler .csv/.json files that need language handling

    Returns:
        List containing all language attributes
    """
    return [
        LABEL[SINGULAR],
        DESCRIPTION[SINGULAR],
        GENDER,
        CITIZENSHIP,
        COUNTRY,