예제 #1
0
파일: scrape.py 프로젝트: lfashby/wikipron
def _scrape_once(data, config: Config) -> Iterator[WordPronPair]:
    """Yield (word, pronunciation) pairs for one batch of category members.

    Args:
        data: Decoded API response; must contain
            ``data["query"]["categorymembers"]``, where every member has
            "title", "timestamp" and "sortkey" entries.
        config: Scraping configuration. Its ``restart_key`` attribute is
            overwritten with each member's sort key as the batch is walked.

    Yields:
        ``(word, pron)`` tuples, with ``pron`` normalized to NFC.
    """
    # The context manager closes the session (and its pooled connections)
    # when the generator is exhausted or closed early; previously the
    # session was never closed.
    with requests_html.HTMLSession() as session:
        for member in data["query"]["categorymembers"]:
            title = member["title"]
            timestamp = member["timestamp"]
            # Recorded before the skip checks, so skipped members update
            # the restart key as well.
            config.restart_key = member["sortkey"]
            if _skip_word(title, config.skip_spaces_word) or _skip_date(
                timestamp, config.cut_off_date
            ):
                continue
            request = session.get(
                _PAGE_TEMPLATE.format(word=title),
                timeout=10,
                headers=HTTP_HEADERS,
            )

            for word, pron in config.extract_word_pron(title, request, config):
                # Pronunciation processing is done in NFD-space;
                # we convert back to NFC afterwards.
                normalized_pron = unicodedata.normalize("NFC", pron)
                # 'cast' is required because 'normalize' doesn't return
                # a 'Pron'.
                yield word, cast(Pron, normalized_pron)
예제 #2
0
def _scrape_once(data, config: Config) -> Iterator[WordPronPair]:
    """Yield (word, pronunciation) pairs for one batch of category members.

    Args:
        data: Decoded API response; must contain
            ``data["query"]["categorymembers"]``, where every member has
            "title" and "timestamp" entries.
        config: Scraping configuration supplying ``cut_off_date`` and the
            ``extract_word_pron`` callable.

    Yields:
        The ``(word, pron)`` tuples produced by ``config.extract_word_pron``
        for every member that is not skipped.
    """
    # The context manager closes the session (and its pooled connections)
    # when the generator is exhausted or closed early; previously the
    # session was never closed.
    with requests_html.HTMLSession() as session:
        for member in data["query"]["categorymembers"]:
            # Named 'title' so the inner loop's 'word' does not shadow it.
            title = member["title"]
            timestamp = member["timestamp"]
            if _skip_word(title) or _skip_date(timestamp, config.cut_off_date):
                continue
            request = session.get(
                _PAGE_TEMPLATE.format(word=title), timeout=10
            )
            for word, pron in config.extract_word_pron(title, request, config):
                yield word, pron
예제 #3
0
def _scrape_once(data, config: Config) -> Iterator[WordPronPair]:
    """Yield (word, pronunciation) pairs for one batch of category members.

    Args:
        data: Decoded API response; must contain
            ``data["query"]["categorymembers"]``, where every member has
            "title" and "timestamp" entries.
        config: Scraping configuration supplying ``no_skip_spaces_word``,
            ``cut_off_date`` and the ``extract_word_pron`` callable.

    Yields:
        ``(word, pron)`` tuples, with ``pron`` normalized to NFC.
    """
    # The context manager closes the session (and its pooled connections)
    # when the generator is exhausted or closed early; previously the
    # session was never closed.
    with requests_html.HTMLSession() as session:
        for member in data["query"]["categorymembers"]:
            # Named 'title' so the inner loop's 'word' does not shadow it.
            title = member["title"]
            timestamp = member["timestamp"]
            if _skip_word(title, config.no_skip_spaces_word) or _skip_date(
                timestamp, config.cut_off_date
            ):
                continue
            request = session.get(
                _PAGE_TEMPLATE.format(word=title), timeout=10
            )
            for word, pron in config.extract_word_pron(title, request, config):
                # Pronunciation processing is done in NFD-space;
                # we convert back to NFC afterwards.
                yield word, unicodedata.normalize("NFC", pron)
예제 #4
0
def main() -> None:
    """Command-line entry point: set up logging, parse args, and scrape.

    Command-line arguments (everything after the program name) are parsed,
    their attributes are forwarded as keyword arguments to ``Config``, and
    the resulting configuration is handed to ``_scrape_and_write``.
    """
    logging.basicConfig(level="INFO", format="%(levelname)s: %(message)s")
    cli_args = _get_cli_args(sys.argv[1:])
    # vars() exposes the parsed arguments' attribute dict for unpacking.
    _scrape_and_write(Config(**vars(cli_args)))
예제 #5
0
def config_factory(**kwargs) -> Config:
    """Build a Config object for testing.

    ``key`` defaults to ``"eng"``; any keyword argument given by the caller
    (including ``key`` itself) takes precedence over that default.
    """
    # Later entries win in a dict display, so caller-supplied values
    # override the single default.
    return Config(**{"key": "eng", **kwargs})