def _scrape_once(data, config: Config) -> Iterator[WordPronPair]:
    """Yield (word, pronunciation) pairs for one batch of category members.

    Args:
        data: Mapping whose ``data["query"]["categorymembers"]`` entries are
            iterated; each member must provide "title", "timestamp" and
            "sortkey".
        config: Scraping configuration. Its ``restart_key`` attribute is
            overwritten with the "sortkey" of every member visited,
            including members that are subsequently skipped.

    Yields:
        ``(word, pron)`` tuples produced by ``config.extract_word_pron``,
        with the pronunciation normalized to NFC.
    """
    session = requests_html.HTMLSession()
    try:
        for member in data["query"]["categorymembers"]:
            title = member["title"]
            timestamp = member["timestamp"]
            # Recorded before the skip checks, so it always reflects the
            # most recently visited member.
            config.restart_key = member["sortkey"]
            if _skip_word(title, config.skip_spaces_word) or _skip_date(
                timestamp, config.cut_off_date
            ):
                continue
            response = session.get(
                _PAGE_TEMPLATE.format(word=title),
                timeout=10,
                headers=HTTP_HEADERS,
            )
            for word, pron in config.extract_word_pron(
                title, response, config
            ):
                # Pronunciation processing is done in NFD-space;
                # we convert back to NFC afterwards.
                normalized_pron = unicodedata.normalize("NFC", pron)
                # 'cast' is required because 'normalize' returns a plain
                # 'str', not a 'Pron'.
                yield word, cast(Pron, normalized_pron)
    finally:
        # Close the session even when the consumer stops iterating early
        # or a request raises; otherwise its connections are leaked.
        session.close()
def _scrape_once(data, config: Config) -> Iterator[WordPronPair]:
    """Yield (word, pronunciation) pairs for one batch of category members.

    Args:
        data: Mapping whose ``data["query"]["categorymembers"]`` entries are
            iterated; each member must provide "title" and "timestamp".
        config: Scraping configuration supplying ``cut_off_date`` and the
            ``extract_word_pron`` callable.

    Yields:
        The ``(word, pron)`` tuples produced by ``config.extract_word_pron``
        for every member that is not skipped.
    """
    session = requests_html.HTMLSession()
    try:
        for member in data["query"]["categorymembers"]:
            # Kept under a separate name so the inner loop's 'word' does
            # not shadow the page title.
            title = member["title"]
            timestamp = member["timestamp"]
            if _skip_word(title) or _skip_date(timestamp, config.cut_off_date):
                continue
            response = session.get(
                _PAGE_TEMPLATE.format(word=title), timeout=10
            )
            for word, pron in config.extract_word_pron(
                title, response, config
            ):
                yield word, pron
    finally:
        # Close the session even when the consumer stops iterating early
        # or a request raises; otherwise its connections are leaked.
        session.close()
def _scrape_once(data, config: Config) -> Iterator[WordPronPair]:
    """Yield (word, pronunciation) pairs for one batch of category members.

    Args:
        data: Mapping whose ``data["query"]["categorymembers"]`` entries are
            iterated; each member must provide "title" and "timestamp".
        config: Scraping configuration supplying ``no_skip_spaces_word``,
            ``cut_off_date`` and the ``extract_word_pron`` callable.

    Yields:
        ``(word, pron)`` tuples produced by ``config.extract_word_pron``,
        with the pronunciation normalized to NFC.
    """
    session = requests_html.HTMLSession()
    try:
        for member in data["query"]["categorymembers"]:
            # Kept under a separate name so the inner loop's 'word' does
            # not shadow the page title.
            title = member["title"]
            timestamp = member["timestamp"]
            if _skip_word(title, config.no_skip_spaces_word) or _skip_date(
                timestamp, config.cut_off_date
            ):
                continue
            response = session.get(
                _PAGE_TEMPLATE.format(word=title), timeout=10
            )
            for word, pron in config.extract_word_pron(
                title, response, config
            ):
                # Pronunciation processing is done in NFD-space;
                # we convert back to NFC afterwards.
                yield word, unicodedata.normalize("NFC", pron)
    finally:
        # Close the session even when the consumer stops iterating early
        # or a request raises; otherwise its connections are leaked.
        session.close()
def main() -> None:
    """Command-line entry point.

    Sets up INFO-level logging, parses the arguments that follow the
    program name, builds a ``Config`` from them and runs the scrape.
    """
    logging.basicConfig(level="INFO", format="%(levelname)s: %(message)s")
    cli_options = vars(_get_cli_args(sys.argv[1:]))
    # Every parsed option is passed to Config as a keyword argument.
    _scrape_and_write(Config(**cli_options))
def config_factory(**kwargs) -> Config:
    """Build a Config for tests.

    ``key`` defaults to "eng"; any keyword argument, including ``key``
    itself, is passed through to ``Config`` and takes precedence.
    """
    # "key" is the one default; entries in kwargs override it.
    return Config(**{"key": "eng", **kwargs})