def _build_config_and_filter_files(config_settings, wiki_name, dialect_suffix=""):
    """Scrapes phonemic and phonetic TSVs, then deletes the ones that are too small.

    Args:
        config_settings: keyword arguments forwarded to `wikipron.Config`;
            must contain a "key" entry, which is used in the TSV file names.
        wiki_name: language name used in the log messages.
        dialect_suffix: optional text placed after "<key>_" in the TSV
            file names.

    Raises:
        KeyError: if `config_settings` has no "key" entry.
    """
    path_affix = f'../tsv/{config_settings["key"]}_{dialect_suffix}'
    phonemic_config = wikipron.Config(**config_settings)
    phonemic_path = f"{path_affix}phonemic.tsv"
    # The value returned by `_call_scrape` is treated below as the number of
    # scraped entries.
    phonemic_count = _call_scrape(config_settings, phonemic_config, phonemic_path)
    phonetic_config = wikipron.Config(phonetic=True, **config_settings)
    phonetic_path = f"{path_affix}phonetic.tsv"
    phonetic_count = _call_scrape(config_settings, phonetic_config, phonetic_path)
    # Removes TSVs with less than 100 entries.
    # Log language name and count to check whether Wikipron scraped any data.
    # The message is emitted *before* the removal in both branches so that
    # the diagnostic is not lost if `os.remove` raises.
    if phonemic_count < 100:
        logger.info(
            (
                '"%s" (count: %s) has less than '
                "100 entries in phonemic transcription."
            ),
            wiki_name,
            phonemic_count,
        )
        os.remove(phonemic_path)
    if phonetic_count < 100:
        logger.info(
            (
                '"%s" (count: %s) has less than '
                "100 entries in phonetic transcription."
            ),
            wiki_name,
            phonetic_count,
        )
        os.remove(phonetic_path)
def _build_scraping_config(config_settings, wiki_name, dialect_suffix=""):
    """Runs one phonemic and one phonetic scrape for a language.

    Args:
        config_settings: keyword arguments forwarded to `wikipron.Config`;
            must contain a "key" entry, which is used in the TSV file names.
        wiki_name: accepted but not used by this function.
        dialect_suffix: optional text placed after "<key>_" in the TSV
            file names.
    """
    output_prefix = f'../tsv/{config_settings["key"]}_{dialect_suffix}'
    # (file name ending, extra `wikipron.Config` keyword arguments), in the
    # order in which the scrapes are run.
    variants = (
        ("phonemic.tsv", {}),
        ("phonetic.tsv", {"phonetic": True}),
    )
    for file_ending, extra_kwargs in variants:
        config = wikipron.Config(**extra_kwargs, **config_settings)
        _call_scrape(config_settings, config, output_prefix + file_ending)
def _build_scraping_config(
    config_settings: Dict[str, Any], wiki_name: str, dialect_suffix: str = ""
) -> None:
    """Runs the phonemic and phonetic scrapes, using whitelists when present.

    For each transcription level, a whitelist file is looked up under
    "../whitelist/". When it exists, its contents (as read by
    `_whitelist_reader`) and the path of a "*_filtered.tsv" file are passed
    to `_call_scrape` in addition to the regular arguments.

    Args:
        config_settings: keyword arguments forwarded to `wikipron.Config`;
            must contain a "key" entry, which is used in the file names.
        wiki_name: accepted but not used by this function.
        dialect_suffix: optional text placed after "<key>_" in the file
            names.

    Raises:
        KeyError: if `config_settings` has no "key" entry.
    """
    path_affix = f'../tsv/{config_settings["key"]}_{dialect_suffix}'
    whitelist_path_affix = (
        f"../whitelist/{config_settings['key']}_{dialect_suffix}"
    )
    # Configures phonemic TSV.
    phonemic_config = wikipron.Config(**config_settings)
    phonemic_path = f"{path_affix}phonemic.tsv"
    # Checks for phonemic whitelist file.
    whitelist_phonemic = f"{whitelist_path_affix}phonemic.whitelist"
    if os.path.exists(whitelist_phonemic):
        logging.info(
            "Phonemic whitelist found for '%s' at '%s'",
            config_settings["key"],
            whitelist_phonemic,
        )
        phonemic_path_filtered = f"{path_affix}phonemic_filtered.tsv"
        phoneme_set = frozenset(_whitelist_reader(whitelist_phonemic))
        _call_scrape(
            config_settings,
            phonemic_config,
            phonemic_path,
            phoneme_set,
            phonemic_path_filtered,
        )
    else:
        _call_scrape(config_settings, phonemic_config, phonemic_path)
    # Configures phonetic TSV.
    phonetic_config = wikipron.Config(phonetic=True, **config_settings)
    phonetic_path = f"{path_affix}phonetic.tsv"
    # Checks for phonetic whitelist file.
    whitelist_phonetic = f"{whitelist_path_affix}phonetic.whitelist"
    if os.path.exists(whitelist_phonetic):
        logging.info(
            "Phonetic whitelist found for '%s' at '%s'",
            config_settings["key"],
            whitelist_phonetic,
        )
        # The filtered output belongs next to the other TSVs. It must not
        # point at the whitelist file itself, which is the input read on the
        # next line.
        phonetic_path_filtered = f"{path_affix}phonetic_filtered.tsv"
        phone_set = frozenset(_whitelist_reader(whitelist_phonetic))
        _call_scrape(
            config_settings,
            phonetic_config,
            phonetic_path,
            phone_set,
            phonetic_path_filtered,
        )
    else:
        _call_scrape(config_settings, phonetic_config, phonetic_path)
def _build_scraping_config(
    config_settings: Dict[str, Any], dialect_suffix: str = ""
) -> None:
    """Runs the phonemic and phonetic scrapes, using phones files when present.

    For each transcription level, a ".phones" file is looked up under
    "../phones/". When it exists, its contents (as read by `_phones_reader`)
    and the path of a "*_filtered.tsv" file are passed to `_call_scrape` in
    addition to the regular arguments.

    Args:
        config_settings: keyword arguments forwarded to `wikipron.Config`;
            must contain a "key" entry, which is used in the file names.
        dialect_suffix: optional text placed after "<key>_" in the file
            names.
    """
    stem = f'{config_settings["key"]}_{dialect_suffix}'
    tsv_prefix = f"../tsv/{stem}"
    phones_prefix = f"../phones/{stem}"

    # Phonemic transcription.
    config = wikipron.Config(**config_settings)
    tsv_path = f"{tsv_prefix}phonemic.tsv"
    phones_file = f"{phones_prefix}phonemic.phones"
    if not os.path.exists(phones_file):
        _call_scrape(config_settings, config, tsv_path)
    else:
        logging.info(
            "Phonemic phones found for %r at %r",
            config_settings["key"],
            phones_file,
        )
        filtered_tsv_path = f"{tsv_prefix}phonemic_filtered.tsv"
        allowed = frozenset(_phones_reader(phones_file))
        _call_scrape(
            config_settings, config, tsv_path, allowed, filtered_tsv_path
        )

    # Phonetic transcription.
    config = wikipron.Config(phonetic=True, **config_settings)
    tsv_path = f"{tsv_prefix}phonetic.tsv"
    phones_file = f"{phones_prefix}phonetic.phones"
    if not os.path.exists(phones_file):
        _call_scrape(config_settings, config, tsv_path)
    else:
        logging.info(
            "Phonetic phones found for %r at %r",
            config_settings["key"],
            phones_file,
        )
        filtered_tsv_path = f"{tsv_prefix}phonetic_filtered.tsv"
        allowed = frozenset(_phones_reader(phones_file))
        _call_scrape(
            config_settings, config, tsv_path, allowed, filtered_tsv_path
        )
def test_language_coverage():
    """Warn about sufficiently large Wiktionary languages WikiPron mishandles.

    Every language whose size reaches `_MIN_LANGUAGE_SIZE` is resolved to a
    key and handed to `wikipron.Config`. A warning is issued when the key is
    rejected or when it resolves to a different language name. Such warnings
    should be silenced by expanding the LANGUAGE_CODES dict to handle the
    relevant languages.
    """
    sizes = _get_language_sizes(_get_language_categories())
    for language, size in sizes.items():
        # Small languages are ignored. "Mon" is skipped because "mon" is the
        # ISO 639 code for Mongolian, but there is also the Mon language
        # (ISO 639 code: "mnw"). "Translingual" is skipped as well
        # (presumably because it is not a single language).
        if size < _MIN_LANGUAGE_SIZE or language in ("Mon", "Translingual"):
            continue
        try:
            language_code = iso639.to_iso639_2(language)
        except iso639.NonExistentLanguageError:
            # Fall back to the raw name to check if WikiPron can handle
            # `language` directly.
            language_code = language
        try:
            language_inferred = wikipron.Config(key=language_code).language
        except iso639.NonExistentLanguageError:
            warnings.warn(f'WikiPron cannot handle "{language}".')
        else:
            if language_inferred != language:
                warnings.warn(
                    f'WikiPron resolves the key "{language_code}" to '
                    f'"{language_inferred}", '
                    f'which is not "{language}" on Wiktionary.'
                )
def _build_scraping_config(
    config_settings: Dict[str, Any], path_affix: str, phones_path_affix: str
) -> None:
    """Runs the broad and narrow scrapes, using phones files when present.

    For each transcription level, "<phones_path_affix><level>.phones" is
    looked up. When it exists, its contents (as read by `_phones_reader`) and
    the path "<path_affix><level>_filtered.tsv" are passed to `_call_scrape`
    in addition to the regular arguments.

    Args:
        config_settings: keyword arguments forwarded to `wikipron.Config`;
            its "key" entry is reported in the log message.
        path_affix: prefix of the TSV output paths.
        phones_path_affix: prefix of the phones file paths.
    """
    # (level, extra `wikipron.Config` keyword arguments, log message), in the
    # order in which the scrapes are run.
    levels = (
        ("broad", {}, "Broad transcription phones found for %r at %r"),
        ("narrow", {"narrow": True}, "Narrow phones found for %r at %r"),
    )
    for level, extra_kwargs, found_message in levels:
        config = wikipron.Config(**extra_kwargs, **config_settings)
        tsv_path = f"{path_affix}{level}.tsv"
        phones_file = f"{phones_path_affix}{level}.phones"
        if not os.path.exists(phones_file):
            _call_scrape(config_settings, config, tsv_path)
            continue
        logging.info(found_message, config_settings["key"], phones_file)
        filtered_tsv_path = f"{path_affix}{level}_filtered.tsv"
        allowed = frozenset(_phones_reader(phones_file))
        _call_scrape(
            config_settings, config, tsv_path, allowed, filtered_tsv_path
        )
def _check_language_code_against_wiki(language_code: str, language: str) -> None:
    """Logs a warning if WikiPron rejects or mis-resolves a language code.

    Args:
        language_code: key handed to `wikipron.Config`.
        language: language name the key is expected to resolve to.
    """
    try:
        resolved = wikipron.Config(key=language_code).language
    except iso639.NonExistentLanguageError:
        logging.warning("WikiPron cannot handle %r", language)
        return
    if resolved == language:
        return
    logging.warning(
        "WikiPron resolves the key %r to %r "
        "listed as %r on Wiktionary",
        language_code,
        resolved,
        language,
    )
# -*- coding: utf-8 -*-
"""
Scrape English pronunciations with WikiPron and dump them to english.json.

Created on Sat May 30 14:54:44 2020

@author: qtckp
"""

import json
import os

import wikipron

pronunciations = {}
english_config = wikipron.Config(key="en")

for count, (word, pron) in enumerate(wikipron.scrape(english_config), start=1):
    # Progress report on every 100th scraped pair.
    if count % 100 == 0:
        print(f"{count} {word} {pron}")
    # Single-character entries are left out of the dictionary.
    if len(word) <= 1:
        continue
    pronunciations[word] = pron

with open("english.json", "w") as output_file:
    json.dump(pronunciations, output_file, indent=4)