예제 #1
0
 def test_filter_chars(self):
     """Check ``filter_chars`` for unknown values and for each filter name/alias pair."""
     source = u"əˈkiːn æˌkænˈθɑ.lə.d͡ʒi"
     parsed = IPAString(unicode_string=source)

     # Values that are not filter names: the result is expected to be
     # equivalent to the unfiltered string.
     passthrough = [None, [], {}, u"", u"foo", u"bar", 0, 1]

     # Each long filter name is followed by its short alias; both members of
     # a pair are expected to give the same result.
     aliased = [
         ((u"cns", u"consonants"), u"knknθld͡ʒ"),
         ((u"vwl", u"vowels"), u"əiææɑəi"),
         ((u"cns_vwl", u"letters"), u"əkinækænθɑləd͡ʒi"),
         ((u"cns_vwl_pstr", u"cvp"), u"əˈkinækænˈθɑləd͡ʒi"),
         ((u"cns_vwl_pstr_long", u"cvpl"), u"əˈkiːnækænˈθɑləd͡ʒi"),
         ((u"cns_vwl_str", u"cvs"), u"əˈkinæˌkænˈθɑləd͡ʒi"),
         ((u"cns_vwl_str_len", u"cvsl"), u"əˈkiːnæˌkænˈθɑləd͡ʒi"),
         ((u"cns_vwl_str_len_wb", u"cvslw"), u"əˈkiːn æˌkænˈθɑləd͡ʒi"),
         ((u"cns_vwl_str_len_wb_sb", u"cvslws"), u"əˈkiːn æˌkænˈθɑ.lə.d͡ʒi"),
     ]

     cases = [(value, source) for value in passthrough]
     for names, expected in aliased:
         cases.extend((name, expected) for name in names)

     for chars, expected in cases:
         outcome = parsed.filter_chars(chars)
         reference = IPAString(unicode_string=expected)
         self.assertTrue(outcome.is_equivalent(reference))
예제 #2
0
def _extract_symbols(input_symbols: List[str], ignore_tones: bool, ignore_arcs: bool, replace_unknown_ipa_by: str = PADDING_SYMBOL) -> List[str]:
  """Re-segment a list of symbols into IPA symbols using ``IPAString``.

  The input symbols are concatenated and parsed strictly (``ignore=False``).
  If parsing fails, a message is printed and a list with one
  ``replace_unknown_ipa_by`` entry per input symbol is returned.

  Otherwise the parsed characters are walked in order:

  * a diacritic or tone is appended to the previous output symbol; if there
    is no previous symbol yet it is dropped;
  * tones are skipped when ``ignore_tones`` is true;
  * a diacritic/tone whose representation equals ``ARC`` becomes its own
    output symbol, or is skipped when ``ignore_arcs`` is true;
  * any other character contributes its ``unicode_repr``; when
    ``ignore_arcs`` is true the representation is split on ``ARC`` and the
    pieces are added individually.

  :param input_symbols: symbols forming one word
  :param ignore_tones: drop tone characters instead of attaching them
  :param ignore_arcs: drop ``ARC`` characters from the output
  :param replace_unknown_ipa_by: filler used when the word cannot be parsed
  :return: the list of extracted symbols
  """
  symbols: List[str] = []
  input_word = ''.join(input_symbols)
  try:
    ipa = IPAString(unicode_string=input_word, ignore=False)
  except Exception:
    # Deliberately broad best-effort fallback, but no longer a bare
    # ``except:`` so KeyboardInterrupt/SystemExit are not swallowed.
    ipa = IPAString(unicode_string=input_word, ignore=True)
    print(f"{input_word} conversion to IPA failed. Result would be: {ipa}.")
    return [replace_unknown_ipa_by] * len(input_symbols)

  for char in ipa.ipa_chars:
    if not (char.is_diacritic or char.is_tone):
      uc = char.unicode_repr
      if ignore_arcs:
        symbols.extend(uc.split(ARC))
      else:
        symbols.append(uc)
      continue

    if not symbols:
      # Nothing to attach the modifier to: it is dropped.
      continue
    if char.is_tone and ignore_tones:
      continue
    # I think it is a bug in IPAString that the arc sometimes gets classified as diacritic and sometimes not
    if char.unicode_repr == ARC:
      if not ignore_arcs:
        symbols.append(ARC)
    else:
      symbols[-1] += char.unicode_repr

  return symbols
예제 #3
0
 def test_ipa_chars_set(self):
     """Assigning ``ipa_chars`` accepts a list of ``U2I`` values and rejects other input."""
     # A list built only from U2I lookups must be accepted.
     IPAString().ipa_chars = [U2I[u"f"], U2I[u"o"], U2I[u"o"]]

     # A plain string, or a list containing None / plain strings, must be rejected.
     rejected = (
         u"foo",
         [U2I[u"f"], None],
         [U2I[u"f"], u"o", u"o"],
     )
     for invalid in rejected:
         with self.assertRaises(TypeError):
             IPAString().ipa_chars = invalid
예제 #4
0
 def test_add(self):
     """Adding two IPAString objects gives a string of the expected length."""
     # (left operand, right operand, expected length of the sum)
     cases = [
         (None, None, 0),
         (None, u"a", 1),
         (u"a", None, 1),
         (u"a", u"b", 2),
         (u"f\u006e\u0361\u006doo", u"", 4),
         (u"f\u006e\u0361\u006doo", u"foo", 7),
     ]
     for left, right, expected_len in cases:
         combined = IPAString(unicode_string=left) + IPAString(unicode_string=right)
         self.assertEqual(len(combined), expected_len)
예제 #5
0
 def test_init_ipa_chars_bad(self):
     """Constructing IPAString with an invalid ``ipa_chars`` raises TypeError."""
     rejected = [
         # Not a list at all.
         1,
         u"",
         "",
         u"foo",
         "foo",
         {"k": "v"},
         # Lists whose elements are not all taken from U2I.
         [None],
         [1],
         [u""],
         [""],
         [u"foo"],
         ["foo"],
         [{"k": "v"}],
         [U2I[u"f"], None],
         ["f", U2I[u"o"], U2I[u"o"]],
     ]
     for candidate in rejected:
         with self.assertRaises(TypeError):
             IPAString(ipa_chars=candidate)
예제 #6
0
 def test_map_ipa_string_ignore(self):
     """Map strings with KirshenbaumMapper while ``ignore=True`` is set on both steps."""
     mapper = KirshenbaumMapper()
     # Inputs without stray characters.
     clean_cases = [
         (u"", u""),
         (u"foo", u"foo"),
         (u"\u0070\u032A", u"p["),
         (u"\u025F", u"J"),
         (u"\u0294", u"?"),
         (u"foo\u025F\u0294", u"fooJ?"),
         (u"fo\u02C8o\u025F\u0294", u"fo'oJ?"),
         (u"foo bar", u"foo#bar<trl>"),
         (u"\u0261\u0067", u"gg"),
         (u"ma\u0272ana", u"man^ana"),
         (u"\u02A3", u"dz"),
         (u"\u02A7", u"tS"),
     ]
     # The same inputs with extra "L"/"M"/"N" letters mixed in; the expected
     # output is the same as for the clean counterpart.
     noisy_cases = [
         (u"L", u""),
         (u"foo", u"foo"),
         (u"\u0070\u032AL", u"p["),
         (u"L\u025FM", u"J"),
         (u"L\u0294M", u"?"),
         (u"fLoo\u025F\u0294M", u"fooJ?"),
         (u"fo\u02C8oL\u025F\u0294M", u"fo'oJ?"),
         (u"fooL MbarN", u"foo#bar<trl>"),
         (u"\u0261L\u0067", u"gg"),
         (u"mLa\u0272Mana", u"man^ana"),
         (u"L\u02A3", u"dz"),
         (u"\u02A7M", u"tS"),
     ]
     for text, expected in clean_cases + noisy_cases:
         parsed = IPAString(unicode_string=text, ignore=True)
         mapped = mapper.map_ipa_string(parsed, ignore=True)
         self.assertEqual(mapped, expected)
예제 #7
0
 def test_is_equivalent(self):
     """Compare pairs of strings with ``is_equivalent``; every pair listed is expected to match."""
     # (first string, second string, expected result of is_equivalent)
     cases = [
         (None, None, True),
         (None, u"", True),
         (u"", None, True),
         (u"", u"", True),
         (u"f", u"f", True),
         (u"f\u006e\u0361\u006d", u"f\u006e\u0361\u006d", True),
         (u"f\u006e\u0361\u006d", u"f\u006e\u035C\u006d", True),
         (u"f\u006e\u0361\u006d", u"f\u006e\u006d", True),
         (u"\u0074\u0361\u026C", u"\u019B", True),
     ]
     for first, second, expected in cases:
         left = IPAString(unicode_string=first)
         right = IPAString(unicode_string=second)
         self.assertEqual(left.is_equivalent(right), expected)
예제 #8
0
파일: mapper.py 프로젝트: wgfi110/ipapy
    def map_unicode_string(
            self,
            unicode_string,
            ignore=False,
            single_char_parsing=False,
            return_as_list=False,
            return_can_map=False):
        """
        Parse a Unicode string as an IPA string and map it with this mapper.

        The string is first turned into an ``IPAString`` and the result is
        then passed to :func:`map_ipa_string`, whose return value is returned
        unchanged.

        Return ``None`` if ``unicode_string`` is ``None``.

        :param str unicode_string: the Unicode string to be parsed
        :param bool ignore: if ``True``, ignore Unicode characters that are not IPA valid
        :param bool single_char_parsing: if ``True``, parse one Unicode character at a time
        :param bool return_as_list: if ``True``, return as a list of strings, one for each IPAChar,
                                    instead of their concatenation (single str)
        :param bool return_can_map: if ``True``, return a pair ``(bool, str)``, where the first element
                                    says if the mapper can map all the IPA characters in the given IPA string,
                                    and the second element is either ``None`` or the mapped string/list
        :rtype: str or (bool, str) or (bool, list)
        """
        if unicode_string is not None:
            parsed = IPAString(
                unicode_string=unicode_string,
                ignore=ignore,
                single_char_parsing=single_char_parsing)
            return self.map_ipa_string(
                ipa_string=parsed,
                ignore=ignore,
                return_as_list=return_as_list,
                return_can_map=return_can_map)
        return None
예제 #9
0
 def _process_phonology(self, string):
     """Strip whitespace, re-render the string through IPAString and segment it.

     Returns the result of ``segment_phonology`` (called with
     ``to_keep=self.diacritics``), or ``None`` if a ``ValueError`` is raised
     while parsing or segmenting.
     """
     compact = "".join(string.split())
     try:
         rendered = "".join(str(char) for char in IPAString(unicode_string=compact))
         return segment_phonology(rendered, to_keep=self.diacritics)
     except ValueError:
         return None
예제 #10
0
def get_syllable_features(syl):
    """Split the transcription in ``syl['ipa']`` into onset, nucleus and coda.

    The transcription is parsed with ``IPAString`` (after replacing ``'I'``
    with ``'ɪ'``); a small table of hand-written corrections is used when
    parsing raises ``ValueError``. The nucleus starts at the first vowel and
    ends at the first consonant that follows it.

    :param syl: mapping with an ``'ipa'`` entry holding the transcription
    :return: ``None`` for the silence marker ``'R'``; otherwise a tuple
             ``(onset, nucleus, coda)`` of slices of ``syl['ipa']``. When no
             vowel is found the whole transcription is returned as nucleus.
    :raises ValueError: if a vowel appears after the nucleus has ended
    :raises KeyError: if parsing fails and no manual correction is listed
    """
    from ipapy.ipastring import IPAString
    import unicodedata

    raw = syl['ipa']
    if raw == 'R':  # represents silence syllables
        return None

    # manually fix some errors
    manual_fixes = {
        'Nis': 'nis',
        'ɾoU': 'ɾou',
        'Vin': 'vin',
        'vIN': 'vɪn',
        'ɾe-': 'ɾe'
    }
    try:
        ipa = IPAString(unicode_string=raw.replace('I', 'ɪ'))
    except ValueError:
        ipa = IPAString(unicode_string=manual_fixes[raw])

    start = end = None
    idx = 0
    for ph in ipa:
        if ph.is_vowel:
            if end is not None:
                raise ValueError("Discontinued nucleus in: {}".format(str(ph)))
            if start is None:
                start = idx
        elif ph.is_consonant and start is not None and end is None:
            end = idx

        # Advance by the number of code points that are not non-spacing marks.
        idx += sum(
            1 for c in ph.unicode_repr if unicodedata.category(c) != 'Mn')

    if start is None:
        assert end is None
        return '', raw, ''

    coda = raw[end:] if end is not None else ''
    return raw[:start], raw[start:end], coda
예제 #11
0
 def test_init_ipa_chars(self):
     """Constructing IPAString from None, an empty list or lists of ``U2I`` values must not raise."""
     accepted = (
         None,
         [],
         [U2I[u"f"]],
         [U2I[u"f"], U2I[u"o"], U2I[u"o"]],
     )
     for chars in accepted:
         IPAString(ipa_chars=chars)
예제 #12
0
 def test_init_unicode_string_ignore(self):
     """With ``ignore=True`` these strings must be accepted without raising."""
     samples = (
         u"L",
         u"fL",
         u"fLooM",
         u"/\u0066\u02BCoo/",
         u"[f\u006e\u0361\u006doo]",
     )
     for text in samples:
         IPAString(unicode_string=text, ignore=True)
예제 #13
0
 def test_init_unicode_string(self):
     """Constructing IPAString from None or from these Unicode strings must not raise."""
     samples = (
         None,
         u"",
         u"f",
         u"foo",
         u"\u0066\u02BCoo",
         u"f\u006e\u0361\u006doo",
     )
     for text in samples:
         IPAString(unicode_string=text)
예제 #14
0
 def test_init_unicode_len(self):
     """``len()`` of a parsed string matches the expected number of characters."""
     # (input string, expected length)
     cases = [
         (None, 0),
         (u"", 0),
         (u"f", 1),
         (u"foo", 3),
         (u"\u0066\u02BCoo", 3),
         (u"f\u006e\u0361\u006doo", 4),
     ]
     for text, expected_len in cases:
         parsed = IPAString(unicode_string=text)
         self.assertEqual(len(parsed), expected_len)
예제 #15
0
    def _parse_phonemes(phonemes):
        """Parse each phoneme as an IPAString and split them into vowels and the rest.

        A phoneme is classified by its first character only: it is a vowel
        when that character's ``is_vowel`` is true, otherwise it goes into
        the second list.

        :return: a pair ``(vowels, consonants)`` of lists of IPAString objects
        """
        parsed = [
            IPAString(unicode_string=p, single_char_parsing=True)
            for p in phonemes
        ]

        vowels = [ipa for ipa in parsed if ipa[0].is_vowel]
        consonants = [ipa for ipa in parsed if not ipa[0].is_vowel]

        return vowels, consonants
예제 #16
0
def i2t(ipa):
    """Tokenize an IPA string and return each token re-rendered through IPAString.

    The input is NFD-normalized and a leading ``*`` is removed before it is
    tokenized with ``ipa2tokens`` (vowels and geminates are not merged).

    :return: list of token strings
    """
    normalized = re.sub(r'^\*', '', unicodedata.normalize('NFD', ipa))
    rendered = []
    for token in ipa2tokens(normalized, merge_vowels=False, merge_geminates=False):
        # NOTE(j_luo) Stress symbol is not handled by `ipapy`'s canonicalization process.
        token = token.replace("'", 'ˈ')
        # NOTE(j_luo) Not sure what these symbols mean.
        token = token.replace('̣', '').replace('̧', '').replace('̦', '')
        rendered.append(str(IPAString(unicode_string=token)))
    return rendered
예제 #17
0
    def _parse_phonemes(self, phonemes):
        """Parse and validate the phonemes, then split them into vowels and consonants.

        Each phoneme is parsed as an IPAString and checked with
        ``self._is_valid``. Classification uses the first character only, so
        a phoneme whose first character is neither vowel nor consonant ends
        up in neither list.

        :return: a pair ``(vowels, consonants)`` of lists of IPAString objects
        :raises ValueError: if ``self._is_valid`` rejects a phoneme
        """
        parsed = [
            IPAString(unicode_string=p, single_char_parsing=True)
            for p in phonemes
        ]

        for candidate in parsed:
            if not self._is_valid(candidate):
                raise ValueError("{} was not a valid phoneme.".format(candidate))

        vowels = [ipa for ipa in parsed if ipa[0].is_vowel]
        consonants = [ipa for ipa in parsed if ipa[0].is_consonant]

        return vowels, consonants
예제 #18
0
 def test_canonical_representation(self):
     """Length of ``canonical_representation`` when parsing one character at a time."""
     # (input string, expected length of the canonical representation)
     cases = [
         (None, 0),
         (u"", 0),
         (u"f", 1),
         (u"foo", 3),
         (u"\u0066\u02BCoo", 3),
         (u"f\u006e\u0361\u006doo", 4),
     ]
     for text, expected_len in cases:
         parsed = IPAString(unicode_string=text, single_char_parsing=True)
         canonical = parsed.canonical_representation
         self.assertEqual(len(canonical), expected_len)
예제 #19
0
 def test_init_unicode_string_bad(self):
     """Byte strings and strings with unsupported characters raise ValueError."""
     rejected = [
         # Byte strings (``\u`` is not an escape inside a bytes literal).
         b"",
         b"f",
         b"foo",
         b"\u0066\u02BCoo",
         b"f\u006e\u0361\u006doo",
         # Unicode strings containing "L"/"M", slashes or square brackets.
         u"L",
         u"fL",
         u"fLooM",
         u"/\u0066\u02BCoo/",
         u"[f\u006e\u0361\u006doo]",
     ]
     for candidate in rejected:
         with self.assertRaises(ValueError):
             IPAString(unicode_string=candidate)
예제 #20
0
 def test_map_ipa_string(self):
     """Map IPA strings to their expected ARPABET output with ARPABETMapper."""
     mapper = ARPABETMapper()
     # (IPA input, expected mapped string)
     cases = [
         (u"", u""),
         (u"p", u"P"),
         (u"p\u03B8", u"PTH"),
         (u"\u027E", u"DX"),
         (u"p\u0258\u026A", u"PEY"),
         (u"p\u0258\u026Aw", u"PEYW"),
         (u"p\u0258\u026A\u0258\u026Aw", u"PEYEYW"),
         (u"p\u0258\u026A\u0251w", u"PEYAAW"),
         (u"\u006A\u0075", u"YUW"),
     ]
     for text, expected in cases:
         mapped = mapper.map_ipa_string(IPAString(unicode_string=text))
         self.assertEqual(mapped, expected)
예제 #21
0
파일: __main__.py 프로젝트: wgfi110/ipapy
def command_canonize(string, vargs):
    """
    Print the canonical representation of the given string.

    Each parsed IPA character is formatted as text and the pieces are
    joined with ``vargs["separator"]``. If parsing raises ``ValueError``,
    the error message is reported through ``print_error`` instead.

    :param str string: the string to act upon
    :param dict vargs: the command line arguments
    """
    try:
        ipa_string = IPAString(
            unicode_string=string,
            ignore=vargs["ignore"],
            single_char_parsing=vargs["single_char_parsing"])
        separator = vargs["separator"]
        pieces = [(u"%s" % c) for c in ipa_string]
        print(separator.join(pieces))
    except ValueError as exc:
        print_error(str(exc))
예제 #22
0
 def test_can_map_ipa_string(self):
     """``can_map_ipa_string`` of ARPABETMapper reports whether every character is mappable."""
     mapper = ARPABETMapper()
     # (IPA input, expected result of can_map_ipa_string)
     cases = [
         (u"", True),
         (u"p", True),
         (u"p\u03B8", True),
         (u"\u027E", True),
         (u"\u0258\u026A", True),
         (u"p\u0258\u026A", True),
         (u"p\u0258\u026Aw", True),
         (u"p\u0258\u026A\u0258\u026Aw", True),
         (u"p\u0258\u026A\u0251w", True),
         (u"\u006A\u0075", True),
         (u"\u1DC6", False),  # valid IPA char, expected to be unmapped in ARPABET
         (u"p\u1DC6b", False),  # valid IPA char, expected to be unmapped in ARPABET
     ]
     for text, expected in cases:
         parsed = IPAString(unicode_string=text)
         self.assertEqual(mapper.can_map_ipa_string(parsed), expected)
예제 #23
0
 def test_map_ipa_string(self):
     """Map IPA strings to their expected Kirshenbaum output with KirshenbaumMapper."""
     mapper = KirshenbaumMapper()
     # (IPA input, expected mapped string)
     cases = [
         (u"", u""),
         (u"foo", u"foo"),
         (u"\u0070\u032A", u"p["),
         (u"\u025F", u"J"),
         (u"\u0294", u"?"),
         (u"foo\u025F\u0294", u"fooJ?"),
         (u"fo\u02C8o\u025F\u0294", u"fo'oJ?"),
         (u"foo bar", u"foo#bar<trl>"),
         (u"\u0261\u0067", u"gg"),
         (u"ma\u0272ana", u"man^ana"),
         (u"\u02A3", u"dz"),
         (u"\u02A7", u"tS"),
     ]
     for text, expected in cases:
         mapped = mapper.map_ipa_string(IPAString(unicode_string=text))
         self.assertEqual(mapped, expected)
예제 #24
0
def extract_symbols(ipa: str):
    """Split a string into symbols, gluing diacritics onto the preceding symbol.

    Every code point of ``ipa`` is parsed on its own with
    ``IPAString(..., ignore=True)``:

    * a code point the parser ignores is kept as its own symbol;
    * a diacritic is appended to the previous symbol, or dropped when there
      is no previous symbol;
    * anything else starts a new symbol.

    :param ipa: the string to split
    :return: list of symbol strings
    """
    symbols = []

    for ch in ipa:
        parsed = IPAString(unicode_string=ch, ignore=True)
        count = len(parsed)

        if count == 0 or (count == 1 and not parsed[0].is_diacritic):
            # Ignored by the parser, or a regular (non-diacritic) character.
            symbols.append(ch)
        elif count == 1:
            # Diacritic: attach it to whatever came before, if anything.
            if symbols:
                symbols[-1] += ch
        else:
            # A single code point is not expected to parse to several characters.
            assert False
    return symbols
예제 #25
0
파일: __main__.py 프로젝트: wgfi110/ipapy
def command_chars(string, vargs):
    """
    Print one line for each IPA character in the given string.

    Each line shows the character's Unicode representation in quotes,
    its name, and the output of ``unicode_to_hex`` for that representation.
    If parsing raises ``ValueError``, the error message is reported
    through ``print_error`` instead.

    :param str string: the string to act upon
    :param dict vargs: the command line arguments
    """
    try:
        ipa_string = IPAString(
            unicode_string=string,
            ignore=vargs["ignore"],
            single_char_parsing=vargs["single_char_parsing"])
        for c in ipa_string:
            glyph = c.unicode_repr
            line = u"'%s'\t%s (%s)" % (glyph, c.name, unicode_to_hex(glyph))
            print(line)
    except ValueError as exc:
        print_error(str(exc))
예제 #26
0
 def test_can_map_ipa_string(self):
     """``can_map_ipa_string`` of KirshenbaumMapper reports whether every character is mappable."""
     mapper = KirshenbaumMapper()
     # (IPA input, expected result of can_map_ipa_string)
     cases = [
         (u"", True),
         (u"foo", True),
         (u"\u0070\u032A", True),
         (u"\u025F", True),
         (u"\u0294", True),
         (u"foo\u025F\u0294", True),
         (u"fo\u02C8o\u025F\u0294", True),
         (u"foo bar", True),
         (u"\u0261\u0067", True),
         (u"ma\u0272ana", True),
         (u"\u02A3", True),
         (u"\u02A7", True),
         # valid IPA char, unmapped in Kirshenbaum
         (u"\u1DC6", False),
         (u"foo\u1DC6bar", False),
     ]
     for text, expected in cases:
         parsed = IPAString(unicode_string=text)
         self.assertEqual(mapper.can_map_ipa_string(parsed), expected)
예제 #27
0
def convert_stress(ipa: str) -> List[str]:
    """Tokenize ``ipa`` and tag every vowel-like token with ``{+}`` or ``{-}``.

    A leading primary-stress mark (``ˈ`` or ``'``) is stripped from its token
    and remembered; the next token recognized as an ``Nphthong`` or a vowel
    ``Segment`` gets ``{+}`` appended. Other vowel-like tokens get ``{-}``.
    A leading secondary-stress mark (``ˌ``) is stripped without any effect.

    :param ipa: the IPA string to process
    :return: the processed tokens
    :raises AssertionError: if a stress mark is never followed by a vowel-like token
    """
    marked = []
    pending_stress = False
    for token in i2t(ipa):
        if token.startswith(('ˈ', "'")):
            token = token[1:]
            pending_stress = True
        elif token.startswith('ˌ'):
            token = token[1:]

        nfd_token = unicodedata.normalize('NFD', token)
        token = str(IPAString(unicode_string=nfd_token, ignore=True))

        seg = _processor.process(token)
        vowel_like = isinstance(seg, Nphthong) or (
            isinstance(seg, Segment) and seg.is_vowel())
        if vowel_like:
            token += '{+}' if pending_stress else '{-}'
            pending_stress = False
        marked.append(token)

    assert not pending_stress
    return marked
예제 #28
0
def test():
    """Print a sample IPA string and the result of ``i2a`` for it.

    ``print`` is called with a single parenthesized argument, which behaves
    the same under Python 2 and Python 3.
    """
    a = u"ˈɑkən"
    print(IPAString(unicode_string=a))
    print(i2a(a))
예제 #29
0
def standardize(ph: str, ignore: bool = False) -> str:
    """Render ``ph`` through IPAString and return the result NFD-normalized.

    :param ph: the string to standardize
    :param ignore: passed to ``IPAString`` as its ``ignore`` argument
    :return: the NFD-normalized string form of the parsed IPAString
    """
    canonical = IPAString(unicode_string=ph, ignore=ignore)
    return unicodedata.normalize('NFD', str(canonical))
예제 #30
0
def extract_symbols(ipa: str):
    """Break ``ipa`` into symbols, merging each diacritic into the symbol before it.

    The string is processed one code point at a time; each code point is
    parsed separately with ``IPAString(..., ignore=True)``. Code points the
    parser ignores and non-diacritic characters each open a new symbol. A
    diacritic is concatenated onto the last symbol, or discarded if no
    symbol has been produced yet.

    :param ipa: the string to split
    :return: list of symbol strings
    """
    symbols = []

    for ch in ipa:
        parsed = IPAString(unicode_string=ch, ignore=True)
        n_chars = len(parsed)

        if n_chars > 1:
            # One code point should never yield more than one IPA character.
            assert False
        elif n_chars == 1 and parsed[0].is_diacritic:
            if symbols:
                symbols[-1] += ch
        else:
            symbols.append(ch)
    return symbols


if __name__ == "__main__":
    # Demo run: split a sample IPA transcription into symbols and print them.
    y = u"ˈprɪnɪŋ, ɪn ðə ˈoʊnli sɛns wɪθ wɪʧ wi ər æt ˈprɛzənt kənˈsərnd, ˈdɪfərz frəm moʊst ɪf nɑt frəm ɔl ðə ɑrts ənd kræfts ˌrɛprɪˈzɛnɪd ɪn ðə ˌɛksəˈbɪʃən."
    # Alternative, shorter sample inputs kept for manual experiments:
    #y = u"wɪʧ"
    #y = "ɪʃn̩'"
    # Whole-string parse with ignore=True; the result is not used below.
    s_ipa = IPAString(unicode_string=y, ignore=True)
    tmp = extract_symbols(y)
    print(tmp)