示例#1
0
def _check_ipa_phonemes(phone_to_examples: Dict[str, Set[str]], filepath: str):
    """Given the phonemes checks whether they are represented in the IPA.

    This will catch problematic phonemes, according to the current IPA standard
    supported by `ipapy`. In addition, it is likely to complain about highly
    specific allophones, which are likely to be present in languages which have
    highly phonetic representation of their phoneme inventory. For a current
    IPA chart, please see:

        https://www.internationalphoneticassociation.org/IPAcharts/IPA_chart_orig/IPA_charts_E.html
    """
    bad_ipa_phonemes = frozenset(
        phone for phone in phone_to_examples.keys()
        if not (ipapy.is_valid_ipa(unicodedata.normalize("NFD", phone))
                or phone in OTHER_VALID_IPA))
    if len(bad_ipa_phonemes) and filepath.endswith("phonemic.tsv"):
        logging.warning("Found %d invalid IPA phones:", len(bad_ipa_phonemes))
        phoneme_id = 1
        for phoneme in bad_ipa_phonemes:
            bad_chars = [
                f"[%d %04x %s %s]" %
                (i, ord(c), unicodedata.category(c), unicodedata.name(c))
                for i, c in enumerate(ipapy.invalid_ipa_characters(phoneme))
            ]
            logging.warning(
                "[%d] Non-IPA transcription: %s (%s)",
                phoneme_id,
                phoneme,
                " ".join(bad_chars),
            )
            phoneme_id += 1
示例#2
0
 def test_invalid_ipa_characters_indices(self):
     """Check `invalid_ipa_characters(..., indices=True)` on a case table.

     Each case pairs an input with the exact value the call is expected to
     return: None for a None input, otherwise a list of (index, character)
     tuples, one per character expected to be reported.
     """
     # The expected indices are positions in the input string, so characters
     # that are not expected to be reported (such as the \u02BC, \u031A and
     # \u0361 escapes below) still advance the index.
     cases = (
         (None, None),
         (u"", []),
         (u"foo", []),
         (u"L", [(0, u"L")]),
         (u"LfM", [(0, u"L"), (2, u"M")]),
         (u"fLoMo", [(1, u"L"), (3, u"M")]),
         (u"L\u0066\u02BCMoo", [(0, u"L"), (3, u"M")]),
         (u"LfM\u02BCoo", [(0, u"L"), (2, u"M")]),
         (u"fL\u031AMoo", [(1, u"L"), (3, u"M")]),
         (u"f\u006eL\u0361\u006doo", [(2, u"L")]),
     )
     for text, expected in cases:
         actual = invalid_ipa_characters(text, indices=True)
         self.assertEqual(actual, expected)
示例#3
0
 def test_invalid_ipa_characters_indices(self):
     """Check `invalid_ipa_characters(..., indices=True)` on a case table.

     Each case pairs an input with the exact value the call is expected to
     return: None for a None input, otherwise a list of (index, character)
     tuples, one per character expected to be reported.
     """
     # The expected indices are positions in the input string, so characters
     # that are not expected to be reported (such as the \u02BC, \u031A and
     # \u0361 escapes below) still advance the index.
     cases = (
         (None, None),
         (u"", []),
         (u"foo", []),
         (u"L", [(0, u"L")]),
         (u"LfM", [(0, u"L"), (2, u"M")]),
         (u"fLoMo", [(1, u"L"), (3, u"M")]),
         (u"L\u0066\u02BCMoo", [(0, u"L"), (3, u"M")]),
         (u"LfM\u02BCoo", [(0, u"L"), (2, u"M")]),
         (u"fL\u031AMoo", [(1, u"L"), (3, u"M")]),
         (u"f\u006eL\u0361\u006doo", [(2, u"L")]),
     )
     for text, expected in cases:
         actual = invalid_ipa_characters(text, indices=True)
         self.assertEqual(actual, expected)