Example #1
 def test_length(self):
     parsed = self.reader.parse_transcript("baːwɑ")
     assert parsed[0] == Token("p(p, b)")
     assert parsed[1] == Token("aː(aː, ɑː)")  # and NOT the same as a(a, ɑ, ə, æ)
     assert parsed[2] == Token("w")
     assert parsed[3] == Token("a(a, ɑ, ə, æ)")
Example #2
 def test_noncombining_forms_are_identical_to_combining(self):
     nfd = "".join([
         unicodedata.lookup("LATIN SMALL LETTER A"),
         unicodedata.lookup("COMBINING ACUTE ACCENT"),
     ])
     nfc = unicodedata.lookup("LATIN SMALL LETTER A WITH ACUTE")
     assert Token(nfc) == Token(nfd)
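
The equality above holds, presumably, because Token normalises its input to NFC, as the `_fulltest` helper in Example #16 does explicitly before constructing a Token. A stdlib-only sketch of the underlying Unicode equivalence:

     import unicodedata

     # NFD: base letter plus a combining accent (two code points)
     nfd = "a\u0301"
     # NFC: a single precomposed code point (LATIN SMALL LETTER A WITH ACUTE)
     nfc = "\u00e1"

     assert nfd != nfc  # the raw strings differ...
     assert unicodedata.normalize("NFC", nfd) == nfc  # ...but agree after NFC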
Example #3
 def test_allophone(self):
     parsed = self.reader.parse_transcript("uwʊl")
     assert parsed[0] == Token("u(u, ʊ)")
     assert parsed[1] == Token("w")
     assert parsed[2] == Token("u(u, ʊ)")
     assert parsed[3] == Token("l")
     assert parsed[0] == parsed[2]
Example #4
 def test_match_inventory_allophone(self):
     inv = [
         Token("i(i, í, ì, ị, ỉ, ĩ)"),
     ]
     o = Ortheme("<x>=/í/", inventory=inv)
     assert o.graphemes == [Token('x')]
     assert o.phonemes == inv
Example #5
    def test_space_ends_word(self):
        # "cit ʔa" being parsed as
        # 440.    iˑ                      vowel
        # 441.    c                       consonant
        # 442.    i                       missing       +
        # 443.    t ʔ                     missing       *
        # 444.    a                       vowel

        f = FileReader()
        f.data['consonants'] = f.parse_inventory("c, t(t, tⁿ)", 'consonant')
        f.data['vowels'] = f.parse_inventory("iˑ, a", 'vowel')
        # add known missings
        f.known_missings = [
            MissingToken("i", known_missing=True),
            MissingToken("ʔ", known_missing=True),
        ]
        transcript = f.standardise("iˑcit ʔa")
        parsed = f.parse_transcript(transcript)
        assert parsed[0] == Token("iˑ")
        assert parsed[1] == Token("c")
        assert parsed[2] == MissingToken("i", known_missing=True)
        assert parsed[3] == Token("t(t, tⁿ)")
        assert parsed[4] == Token(" ")
        assert parsed[5] == MissingToken("ʔ", known_missing=True)
        assert parsed[6] == Token("a")
Example #6
 def test_15(self):
     o = Ortheme("<i(i, í, ì, ị, ỉ, ĩ)>-<y(y, ý, ỳ, ỷ, ỹ, ỵ)>=/i/")
     assert o.graphemes == [
         Token("i(i, í, ì, ị, ỉ, ĩ)"),
         Token("y(y, ý, ỳ, ỷ, ỹ, ỵ)")
     ]
     assert o.phonemes == [Token('i')]
Example #7
 def test_missing_token(self):
     string = "ɔlaɣ"
     parsed = self.reader.parse_transcript(string)
     assert len(parsed) == len(string)
     assert parsed[0] == MissingToken("ɔ")
     assert parsed[1] == Token("l")
     assert parsed[2] == Token("a(a, ɑ, ə, æ)")
     assert parsed[3] == MissingToken("ɣ")
Example #8
 def test_duplication(self):
     string = "llall"
     parsed = self.reader.parse_transcript(string)
     assert len(parsed) == len(string)
     assert parsed[0] == Token("l")
     assert parsed[1] == Token("l")
     assert parsed[2] == Token("a(a, ɑ, ə, æ)")
     assert parsed[3] == Token("l")
     assert parsed[4] == Token("l")
Example #9
 def test_slippage_and_duplication(self):
     string = "ɔlalɣ"
     parsed = self.reader.parse_transcript(string)
     assert len(parsed) == len(string)
     assert parsed[0] == MissingToken("ɔ")
     assert parsed[1] == Token("l")
     assert parsed[2] == Token("a(a, ɑ, ə, æ)")
     assert parsed[3] == Token("l")
     assert parsed[4] == MissingToken("ɣ")
Example #10
 def test_rhotic_hook(self):
     # lia˞u˞
     f = FileReader()
     f.data['consonants'] = f.parse_inventory("l", 'consonant')
     f.data['vowels'] = f.parse_inventory("i, au(au, a˞u˞)", 'vowels')
     transcript = f.standardise('lia˞u˞')
     parsed = f.parse_transcript(transcript)
     assert parsed[0] == Token("l")
     assert parsed[1] == Token("i")
     assert parsed[2] == Token("au(au, a˞u˞)")
Example #11
 def test_identified_missing_characters(self):
     expected_missings = [
         Token("oː"),
         Token("ɣ"),
         Token("ɔ"),
         Token("ç"),
         MissingToken('ɂ')
     ]
     for m in self.reader.errors:
         assert m in expected_missings, '%s is not in expected_missings' % m
Example #12
 def test_shilluk(self):
     f = FileReader()
     f.data['consonants'] = f.parse_inventory("ŋ", 'consonant')
     f.data['vowels'] = f.parse_inventory(
         "ɪ(ɪ́, ɪ̄, ɪ̀, ɪ̌, ɪ̂, ɪ̂́), a(á, ā, à, ǎ, â), ɪː(ɪ́ː, ɪ̄ː, ɪ̀ː, ɪ̌ː, ɪ̂ː, ɪ̂́ː)",
         'vowels')
     transcript = f.standardise("ɪ̂́ŋ-à")
     parsed = f.parse_transcript(transcript)
     assert parsed[0] == Token("ɪ(ɪ́, ɪ̄, ɪ̀, ɪ̌, ɪ̂, ɪ̂́)")
     assert parsed[1] == Token("ŋ")
     assert parsed[2] == Token("-")
     assert parsed[3] == Token("a(á, ā, à, ǎ, â)")
Example #13
    def test_toIPA_uses_lowercase(self):
        text = self.reader.toIPA("PTKCH")
        expected = [
            Token('p(p, pʰ, pʷ)'),
            Token('t'),  # lost 't̪' as n:n
            Token('k(k, c, kʰ, cʰ, kʷ)'),
            Token('tʃ'),  # NOT <k(...)> and <h>
        ]
        assert len(expected) == len(text)

        for i, e in enumerate(expected):
            assert e == text[i], 'Mismatch %r : %r' % (e, text[i])
Example #14
 def test_ellipsis(self):
     # an error with ellipsis. [...]
     f = FileReader()
     f.data['consonants'] = f.parse_inventory("l n", 'consonant')
     f.data['vowels'] = f.parse_inventory("", 'vowels')
     f.known_missings.update(f.parse_list(["/[...]/"]))
     transcript = f.standardise("l [...] n")
     parsed = f.parse_transcript(transcript)
     assert parsed[0] == Token("l")
     assert parsed[1] == Token(" ")
     assert parsed[2] == Token("[...]")
     assert parsed[3] == Token(" ")
     assert parsed[4] == Token("n")
Example #15
 def test_one(self):
     # parsed as
     # 130.    j(j, ɲ)             consonant
     # 131.    o(o, ó, ò, ô, ŏ̀)    vowel
     # 132.    ↗                   other
     # 133.    óʔẁ̰                 missing       *
     # 134.                        punctuation
     transcript = self.f.standardise("jó↗óʔẁ̰")
     parsed = self.f.parse_transcript(transcript)
     assert parsed[0] == Token("j(j, ɲ)")
     assert parsed[1] == Token("o(o, ó, ò, ô, ŏ̀)")
     assert parsed[2] == MissingToken("↗")
     assert parsed[3] == Token("o(o, ó, ò, ô, ŏ̀)")
     assert parsed[4] == Token("ʔw̰(ʔw̰, ʔŋ̰ʷ, ʔẁ̰)")
Example #16
    def _fulltest(self, s, allophones, names):
        """Runs a full set of tests on the string `s`
        
        1. tests that the .raw value matches the original input `s`
        2. tests that the .token matches the original input `s`
        3. tests that the __repr__ is correct
        4. tests that the allophones are correctly extracted (list
            expected allophones in parameter `allophones`.
            Can be None)
        5. tests that the identified token names are correct (list
            expected names in parameter `names`)
        6. tests that variants list matches the expected
        """
        # standardise first as that's what Token does.
        s = unicodedata.normalize("NFC", s)
        if allophones:
            allophones = [unicodedata.normalize("NFC", a) for a in allophones]

        t = Token(s)
        try:
            # 1. tests that the .raw value matches the original input `s`
            assert t.raw == s, "Raw value %r != expected %r" % (t.raw, s)
            # 2. tests that the .token matches the original input `s`
            assert t.token == s.split("(")[0], \
                "Token %r != expected %r" % (t.token, s.split("(")[0])
            # 3. tests that the __repr__ is correct
            assert repr(t) == '<%s>' % s, \
                "Repr %r != expected %r" % (repr(t), '<%s>' % s)
            # 4. tests that the allophones are correctly extracted
            assert t.allophones == allophones, \
                "Allophones %r != %r" % (t.allophones, allophones)
            # 5. tests that the identified token names are correct
            assert len(t.names) == len(names), \
                "Unequal number of names: %r != %r" % (t.names, names)
            for i, name in enumerate(t.names):
                assert name == names[i], "Unexpected name %d, %s != %s" % (
                    i, name, names[i])
            # 6. tests that variants list matches the expected
            if t.allophones is None:
                assert len(t.variants) == 1
                assert t.variants == [t.token]
            else:
                assert len(t.variants) == len(t.allophones)
                for a in t.allophones:
                    assert a in t.variants
        except AssertionError as e:  # pragma: no cover
            t.debug()
            raise e
        return True
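
A hypothetical call to the helper above; the token string, allophones, and names are illustrative only, and assume `.names` holds the Unicode character names of the code points in the base token (the exact semantics of `.names` are not shown in this listing):

    # hypothetical values, not taken from the real test suite
    self._fulltest(
        "a(a, ɑ)",                  # base token "a" with two allophones
        ["a", "ɑ"],                 # expected allophones
        ["LATIN SMALL LETTER A"],   # expected name of the base code point
    )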
Example #17
    def test_maximal_error(self):
        # should identify the missing token as "oː" not "ː"
        transcript = 'oːlal'
        f = FileReader()
        f.data['consonants'] = f.parse_inventory("l, ɭ, ʎ, r(r, ɾ, ɹ)",
                                                 'consonant')
        f.data['vowels'] = f.parse_inventory(
            "a(a, ɑ, ə, æ), o(o, ɒ), u(u, ʊ), uː", 'vowel')

        transcript = f.standardise(transcript)
        parsed = f.parse_transcript(transcript)
        assert parsed[0] == MissingToken("oː")
        assert parsed[1] == Token("l")
        assert parsed[2] == Token("a(a, ɑ, ə, æ)")
        assert parsed[3] == Token("l")
Example #18
 def test_5pc(self):
     res = [c for c in self.cov if c.ppercent == 5][0]
     assert res.ppercent == 5
     assert res.position == 42  #  843 / 20 =  42.15
     assert res.observed == 15
     assert res.opercent == (15 / res.total_inv) * 100
     assert res.transcript[-1] == Token('r(r, ɾ, ɹ)')
Example #19
 def test_sandawe(self):
     # ǁ’àká being parsed as:
     # 489.    ‖                                      punctuation
     # 490.    ’                                      missing       *
     # 491.    a(a, á, à, ǎ, â)                       vowel
     # 492.    k                                      consonant
     #
     # ǁ’ is in the inventory but I think it's being overridden by the default ǁ in boundary tokens
     f = FileReader()
     f.data['consonants'] = f.parse_inventory("k, ǁ’", 'consonant')
     f.data['vowels'] = f.parse_inventory("a(a, á, à, ǎ, â)", 'vowels')
     transcript = f.standardise('ǁ’àká')
     parsed = f.parse_transcript(transcript)
     assert parsed[0] == Token("ǁ’")
     assert parsed[1] == Token("a(a, á, à, ǎ, â)")
     assert parsed[2] == Token("k")
     assert parsed[3] == Token("a(a, á, à, ǎ, â)")
Example #20
 def test_danish_overextension(self):
     # was being parsed as ... MissingToken("də") instead of
     # MissingToken("d"), Token("ə")
     transcript = 'b̥lɛːsdə'
     f = FileReader()
     f.data['consonants'] = f.parse_inventory(
         "b̥(b̥, b̥ʰ), d̥(d̥, d̥s), s, l(l, l̩)", 'consonant')
     f.data['vowels'] = f.parse_inventory("e(e, eː), ɛ(ɛ, ɛː), a, ɑ, ə",
                                          'vowel')
     transcript = f.standardise(transcript)
     parsed = f.parse_transcript(transcript)
     assert parsed[0] == MissingToken("b̥(b̥, b̥ʰ)")
     assert parsed[1] == Token("l(l, l̩)")
     assert parsed[2] == Token("ɛ(ɛ, ɛː)")
     assert parsed[3] == Token("s")
     assert parsed[4] == MissingToken("d")
     assert parsed[5] == Token("ə")
Example #21
 def test_sandawe_2(self):
     # ǀ’ùsù being parsed as:
     # 67.    |                                      punctuation
     # 68.    ’                                      missing       *
     # 69.    u(u, ú, ù, ǔ, û)                       vowel
     # 70.    s                                      consonant
     # 71.    u(u, ú, ù, ǔ, û)                       vowel
     #
     # ǀ’ is in the inventory but I think it's being overridden by the default ǀ in boundary tokens
     f = FileReader()
     f.data['consonants'] = f.parse_inventory("s, ǀ’, x", 'consonant')
     f.data['vowels'] = f.parse_inventory("u(u, ú, ù, ǔ, û)", 'vowels')
     transcript = f.standardise('ǀ’ùsù')
     parsed = f.parse_transcript(transcript)
     assert parsed[0] == Token("ǀ’")
     assert parsed[1] == Token("u(u, ú, ù, ǔ, û)")
     assert parsed[2] == Token("s")
     assert parsed[3] == Token("u(u, ú, ù, ǔ, û)")
Example #22
 def test_s_COMBINING_INVERTED_BRIDGE_BELOW_allophone(self):
     # the reason this failed was that s̺ isn't in the allophones
     # so s̺ didn't match anything. This is fixed at the Token level
     # and checked in test_Token.test_initial_char_in_allophones
     f = FileReader()
     f.data['consonants'] = f.parse_inventory("s̺(s, s̬, s̺)", 'consonant')
     parsed = f.parse_transcript(f.standardise('s̺'))
     assert len(parsed) == 1
     assert parsed[0] == Token('s̺(s, s̬, s̺)')
Example #23
    def test_basaa_combining_n_only_attached_to_preceeding(self):
        # pêⁿbà being parsed as:
        # 43.	p                   	consonant
        # 44.	e(e, é, è, ě, ê)    	vowel
        # 45.	hⁿ                  	missing   	*
        # 46.	b                   	missing   	*
        # 47.	a(a, á, à, ǎ, â)    	vowel
        f = FileReader()
        f.data['consonants'] = f.parse_inventory("p, h, ⁿb", 'consonant')
        f.data['vowels'] = f.parse_inventory(
            "e(e, é, è, ě, ê), a(a, á, à, ǎ, â)", 'vowels')
        transcript = f.standardise('pêhⁿbà')
        parsed = f.parse_transcript(transcript)

        assert parsed[0] == Token("p")
        assert parsed[1] == Token("e(e, é, è, ě, ê)")
        assert parsed[2] == Token("h")
        assert parsed[3] == Token("ⁿb")
        assert parsed[4] == Token("a(a, á, à, ǎ, â)")
Example #24
 def test_initial_char_in_allophones(self):
     # see test_regression.test_s_COMBINING_INVERTED_BRIDGE_BELOW_allophone
     # and test_regression.test_galician
     t = Token("b(b̥, β̞)")
     assert len(t.allophones) == 2
     assert "b̥" in t.allophones
     assert "β̞" in t.allophones
     assert len(t.variants) == 3
     assert "b" in t.variants
     assert "β̞" in t.variants
     assert "b̥" in t.variants
Example #25
    def test_inventory_and_orthography_token_matches(self):
        # The orthographies tend to be underspecified e.g.:
        # p(p, pʰ, pʷ) is in the inventory but the orthography has
        # <p> = /p/
        # ... so we should first match the full and if not found then see
        # if we match the short form.
        assert 'p' in self.reader.get_variants()
        # ... and in orthography
        assert Ortheme('<p> = /p(p, pʰ, pʷ)/') in self.reader.orthography

        #...and in variants
        assert Token('p(p, pʰ, pʷ)') == self.reader.get_variants()['p']
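
A sketch of the two-step match described in the comments, using only the `.raw` and `.token` attributes exercised by `_fulltest` in Example #16. This illustrates the lookup order, not the reader's actual implementation:

    def resolve(phoneme, inventory):
        # 1. try the full form first, e.g. "p(p, pʰ, pʷ)"
        for tok in inventory:
            if tok.raw == phoneme:
                return tok
        # 2. fall back to the short form before the allophone list, e.g. "p"
        for tok in inventory:
            if tok.token == phoneme:
                return tok
        return None  # the caller decides whether this becomes a MissingToken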
Example #26
    def test_combining_in_others(self):
        # Setswana's 'bó̝kɔ̝̀ːnì' was being parsed as:
        #
        # 8.    b                                      consonant
        # 9.    o̝(o̝, ò̝, ó̝, ô̝, ǒ̝)                 vowel
        # 10.    k                                      consonant
        # 11.    ɔ̝̀ː                                   missing       *
        # 12.    n                                      consonant
        # 13.    i(i, ì, í, î, ǐ, ì̞, í̞)               vowel
        #
        # i.e. in token 11 the triangular-colon length mark "ː" is merged
        # onto the character 'ɔ̝̀'. 'ɔ̝̀' IS in the inventory, but 'ɔ̝̀ː' is NOT,
        # so this gets flagged as an error. "ː" is in the other symbols but
        # is currently not being recognised as such.

        f = FileReader()
        f.data['consonants'] = f.parse_inventory("b, k, n", 'consonant')
        f.data['vowels'] = f.parse_inventory(
            "o̝(o̝, ò̝, ó̝, ô̝, ǒ̝), i(i, ì, í, î, ǐ, ì̞, í̞)", 'vowel')
        f.known_missings.update(f.parse_list(['/ɔ̝̀/']))
        f.other_symbols.update(f.parse_inventory('ː', 'other'))
        # Other: ː
        transcript = 'bó̝kɔ̝̀ːnì'
        transcript = f.standardise(transcript)
        parsed = f.parse_transcript(transcript)
        assert parsed[0] == Token("b"), parsed
        assert parsed[1] == Token("o̝(o̝, ò̝, ó̝, ô̝, ǒ̝)"), parsed
        assert parsed[2] == Token("k"), parsed
        assert parsed[3] == MissingToken("ɔ̝̀"), parsed
        assert parsed[4] == Token("ː"), parsed
        assert parsed[5] == Token("n"), parsed
        assert parsed[6] == Token("i(i, ì, í, î, ǐ, ì̞, í̞)"), parsed
Example #27
    def test_basaa_ignored_superscript_n(self):
        # gáː ⁿbɛ̀βí being parsed as
        #
        # 9.     h                       consonant
        # 10.    a(a, á, à, ǎ, â)    vowel
        # 11.    ŋ(ŋ, ŋ́, ŋ̀)            consonant
        # 12.    g                       missing       *
        # 13.    aː(aː, áː, àː, ǎː, âː)    vowel
        # 14.     ⁿ                      missing       *
        # 15.    b                       missing       *
        # 16.    ɛ(ɛ, ɛ́, ɛ̀, ɛ̌, ɛ̂)    vowel
        # 17.    β                       consonant
        # 18.    i(i, í, ì, ǐ, î)    vowel
        #
        # i.e. 14 should be combined with 15 = ⁿb
        f = FileReader()
        f.data['consonants'] = f.parse_inventory("gʷ, ⁿb, ⁿg, β", 'consonant')
        f.data['vowels'] = f.parse_inventory(
            """
            a(a, á, à, ǎ, â), aː(aː, áː, àː, ǎː, âː),
            e(e, é, è, ě, ê), ɛ(ɛ, ɛ́, ɛ̀, ɛ̌, ɛ̂),
            i(i, í, ì, ǐ, î),
            """, 'vowels')
        transcript = f.standardise('gáː ⁿbɛ̀βí')
        parsed = f.parse_transcript(transcript)

        assert parsed[0] == MissingToken("g")  # known missing
        assert parsed[1] == Token("aː(aː, áː, àː, ǎː, âː)")
        assert parsed[2] == Token(" ")  # was incorrect -- should be SPACE.
        assert parsed[3] == Token("ⁿb")  # was incorrect
        assert parsed[4] == Token("ɛ(ɛ, ɛ́, ɛ̀, ɛ̌, ɛ̂)")
        assert parsed[5] == Token("β")
        assert parsed[6] == Token("i(i, í, ì, ǐ, î)")
Example #28
    def test_overmatching(self):
        # this was being parsed as:
        #  [<h>, <ao>, <MissingToken: ̯>, <a>]
        # ...should be:
        #  [<h>, <a>, <o̯>, <a>]

        # I think this is only a problem with the full inventory, when a
        # word of the form "123" is encountered and the tokens "1", "12"
        # and "23" all exist (see the sketch after this test).
        f = FileReader()
        f.data['consonants'] = f.parse_inventory("h", 'consonant')
        f.data['vowels'] = f.parse_inventory(
            "i, e(e, e̯), ɜ, a, ɔ, o(o, o̯), u, ao", 'vowel')

        transcript = 'hao̯a'
        transcript = f.standardise(transcript)
        parsed = f.parse_transcript(transcript)
        assert parsed[0] == Token("h")
        assert parsed[1] == Token("a")
        assert parsed[2] == Token("o(o, o̯)")
        assert parsed[3] == Token("a")
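
A minimal sketch of the longest-match-first failure mode described in the comment above; `greedy_parse` is an illustration, not the FileReader algorithm:

    def greedy_parse(word, tokens):
        # longest-match-first tokeniser: at each position, take the
        # longest prefix that is a known token
        out, i = [], 0
        while i < len(word):
            for size in range(len(word) - i, 0, -1):
                if word[i:i + size] in tokens:
                    out.append(word[i:i + size])
                    i += size
                    break
            else:
                out.append(word[i])  # no match: emit the bare character
                i += 1
        return out

    # greedy matching grabs "12" first and strands the "3",
    # even though the split "1" + "23" would succeed
    assert greedy_parse("123", {"1", "12", "23"}) == ["12", "3"]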
Example #29
    def test_get_missing(self):
        # if missing char is in the known_missings, then it returns a MissingToken
        # with known_missing set to True
        x = self.reader.get_missing('x')
        assert x == MissingToken('x')
        assert x.known_missing == True

        # if missing char is in the default_tokens, then it returns a Token
        # with phoneme_type="default"
        dot = self.reader.get_missing(".")
        assert dot == Token('.')
        assert dot.is_missing == False
        assert dot.phoneme_type == 'default'

        # otherwise it's just Missing
        nine = self.reader.get_missing('9')
        assert nine == MissingToken('9')
        assert nine.known_missing == False
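
A sketch of the dispatch this test exercises. The branching mirrors the three cases in the comments; the container arguments are hypothetical, and whether Token accepts `phoneme_type` in its constructor is not shown in this listing:

    def get_missing_sketch(char, known_missings, default_tokens):
        # 1. explicitly listed as a known missing -> known_missing=True
        if char in known_missings:
            return MissingToken(char, known_missing=True)
        # 2. a default token (e.g. punctuation like ".") -> an ordinary Token
        if char in default_tokens:
            token = Token(char)
            token.phoneme_type = 'default'  # assumed to be a settable attribute
            return token
        # 3. anything else is simply missing
        return MissingToken(char)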
Example #30
 def test_upper_xumi(self):
     # an error with multi-character other symbols being identified as
     # single ones, e.g. here "||" is being identified as two "|" tokens
     f = FileReader()
     f.data['consonants'] = f.parse_inventory("l H", 'consonant')
     f.data['vowels'] = f.parse_inventory("i", 'vowels')
     f.known_missings.update(f.parse_list(["/|/", "/||/"]))
     transcript = f.standardise("li || H")
     parsed = f.parse_transcript(transcript)
     assert parsed[0] == Token("l")
     assert parsed[1] == Token("i")
     assert parsed[2] == Token(" ")
     assert parsed[3] == Token("||")
     assert parsed[4] == Token(" ")
     assert parsed[5] == Token("H")