def _check_ipa_phonemes(phone_to_examples: Dict[str, Set[str]], filepath: str): """Given the phonemes checks whether they are represented in the IPA. This will catch problematic phonemes, according to the current IPA standard supported by `ipapy`. In addition, it is likely to complain about highly specific allophones, which are likely to be present in languages which have highly phonetic representation of their phoneme inventory. For a current IPA chart, please see: https://www.internationalphoneticassociation.org/IPAcharts/IPA_chart_orig/IPA_charts_E.html """ bad_ipa_phonemes = frozenset( phone for phone in phone_to_examples.keys() if not (ipapy.is_valid_ipa(unicodedata.normalize("NFD", phone)) or phone in OTHER_VALID_IPA)) if len(bad_ipa_phonemes) and filepath.endswith("phonemic.tsv"): logging.warning("Found %d invalid IPA phones:", len(bad_ipa_phonemes)) phoneme_id = 1 for phoneme in bad_ipa_phonemes: bad_chars = [ f"[%d %04x %s %s]" % (i, ord(c), unicodedata.category(c), unicodedata.name(c)) for i, c in enumerate(ipapy.invalid_ipa_characters(phoneme)) ] logging.warning( "[%d] Non-IPA transcription: %s (%s)", phoneme_id, phoneme, " ".join(bad_chars), ) phoneme_id += 1
def test_invalid_ipa_characters_indices(self):
    """Check `invalid_ipa_characters` called with `indices=True`.

    Each case pairs an input string with the expected list of
    (index, character) tuples; a `None` input is expected to give `None`.
    """
    cases = [
        (None, None),
        (u"", []),
        (u"foo", []),
        (u"L", [(0, u"L")]),
        (u"LfM", [(0, u"L"), (2, u"M")]),
        (u"fLoMo", [(1, u"L"), (3, u"M")]),
        (u"L\u0066\u02BCMoo", [(0, u"L"), (3, u"M")]),
        (u"LfM\u02BCoo", [(0, u"L"), (2, u"M")]),
        (u"fL\u031AMoo", [(1, u"L"), (3, u"M")]),
        (u"f\u006eL\u0361\u006doo", [(2, u"L")]),
    ]
    for text, expected in cases:
        actual = invalid_ipa_characters(text, indices=True)
        self.assertEqual(actual, expected)