def test_combining_in_others(self):
    """Regression: a combining symbol listed in ``other_symbols`` (here the
    length mark "ː") must be emitted as its own token rather than being
    merged into the preceding segment.

    Setswana's 'bó̝kɔ̝̀ːnì' was being parsed as:

    8. b consonant
    9. o̝(o̝, ò̝, ó̝, ô̝, ǒ̝) vowel
    10. k consonant
    11. ɔ̝̀ː missing *
    12. n consonant
    13. i(i, ì, í, î, ǐ, ì̞, í̞) vowel

    i.e. in token 11 the combining character of double triangle "ː" is
    merged to the character 'ɔ̝̀'. 'ɔ̝̀' is IN the inventory, but 'ɔ̝̀ː' is NOT
    so this gets flagged as an error. "ː" is in other symbols and is
    currently not being recognized as such.
    """
    # Minimal inventory reproducing the Setswana case.
    f = FileReader()
    f.data['consonants'] = f.parse_inventory("b, k, n", 'consonant')
    f.data['vowels'] = f.parse_inventory(
        "o̝(o̝, ò̝, ó̝, ô̝, ǒ̝), i(i, ì, í, î, ǐ, ì̞, í̞)", 'vowel')
    # 'ɔ̝̀' is deliberately NOT in the vowel inventory — it is a known missing.
    f.known_missings.update(f.parse_list(['/ɔ̝̀/']))
    # The length mark must be registered as an "other" symbol.
    f.other_symbols.update(f.parse_inventory('ː', 'other'))  # Other: ː
    transcript = 'bó̝kɔ̝̀ːnì'
    transcript = f.standardise(transcript)
    parsed = f.parse_transcript(transcript)
    # Expected: "ː" surfaces as its own Token (index 4), leaving the bare
    # 'ɔ̝̀' to be matched against known_missings (index 3).
    assert parsed[0] == Token("b"), parsed
    assert parsed[1] == Token("o̝(o̝, ò̝, ó̝, ô̝, ǒ̝)"), parsed
    assert parsed[2] == Token("k"), parsed
    assert parsed[3] == MissingToken("ɔ̝̀"), parsed
    assert parsed[4] == Token("ː"), parsed
    assert parsed[5] == Token("n"), parsed
    assert parsed[6] == Token("i(i, ì, í, î, ǐ, ì̞, í̞)"), parsed
def test_ellipsis(self):
    """An ellipsis marker "[...]" registered as a known missing must come
    through parsing as one single token, not be broken apart."""
    reader = FileReader()
    reader.data['consonants'] = reader.parse_inventory("l n", 'consonant')
    reader.data['vowels'] = reader.parse_inventory("", 'vowels')
    reader.known_missings.update(reader.parse_list(["/[...]/"]))

    parsed = reader.parse_transcript(reader.standardise("l [...] n"))

    expected = [
        Token("l"),
        Token(" "),
        Token("[...]"),
        Token(" "),
        Token("n"),
    ]
    for index, want in enumerate(expected):
        assert parsed[index] == want
def test_upper_xumi(self):
    """Multi-character symbols must match greedily: "||" has to be parsed
    as one token, not as two consecutive "|" tokens."""
    reader = FileReader()
    reader.data['consonants'] = reader.parse_inventory("l H", 'consonant')
    reader.data['vowels'] = reader.parse_inventory("i", 'vowels')
    reader.known_missings.update(reader.parse_list(["/|/", "/||/"]))

    parsed = reader.parse_transcript(reader.standardise("li || H"))

    expected = [
        Token("l"),
        Token("i"),
        Token(" "),
        Token("||"),
        Token(" "),
        Token("H"),
    ]
    for index, want in enumerate(expected):
        assert parsed[index] == want
class Test_Mambai(unittest.TestCase):
    """Regression tests using the full Mambai inventory, where large
    allophone sets with stacked combining marks exercised the maximal-match
    tokenizer."""

    def setUp(self):
        # Build a reader with the complete Mambai consonant and vowel
        # inventories; parenthesised groups list allophones of the head
        # segment.
        self.f = FileReader()
        self.f.data['consonants'] = self.f.parse_inventory(
            """ p, b, t, d, k, g(g, k̚, q̚, ɣ, ʁ), kp(kp, kpŋm), gb, ɓ(ɓ, ʔm̰, ʔɓ, ʔp), ɗ(ɗ, ʔn̰, ʔɗ, ʔl̰), m, n, ŋ, ⱱ̟, ɽ(ɽ, ɳ̆, r), f, v, s, z, h, j(j, ɲ), ʔj̰(ʔj̰, ʔɲ̰), w(w, ŋʷ), ʔw̰(ʔw̰, ʔŋ̰ʷ, ʔẁ̰), l(l, n), ʔ """,
            "consonant")
        # NOTE(review): inside the eː group below, "e:" uses an ASCII colon
        # rather than the length mark "ː" — confirm this is intentional
        # source data and not a typo.
        self.f.data['vowels'] = self.f.parse_inventory(
            """ i(i, í, ì, î, ĭ̀, ĭ́, íʔḭ̆́), ĩ(ĩ, ĩ́, ĩ̀, ĩ̂), ḭ̃(ḭ̃, ḭ̃́, ḭ̃̀, ḭ̃̂), ḭ(ḭ, ḭ́, ḭ̀, ḭ̂, iʔḭ), iː(iː, íː, ìː, îː), ĩː(ĩː, ĩ́ː, ĩ̀ː, ĩ̂ː), iˤ(iˤ, íˤ, ìˤ, îˤ, eˤ, éˤ, èˤ, êˤ), ĩˤ(ĩˤ, ĩ́ˤ, ĩ̀ˤ, ĩ̂ˤ), ẽˤ(ẽˤ, ẽ́ˤ, ẽ̀ˤ, ẽ̂ˤ), e(e, é, è, ê), ḛ(ḛ, ḛ́, ḛ̀, ḛ̂, eʔḛ, èʔḛ̆), eː(e:, éː, èː, êː), ḛ̃(ḛ̃, ḛ̃́, ḛ̃̀, ḛ̃̂), a(a, á, à, â), ã(ã, ã́, ã̀, ã̂), a̰(a̰, á̰, ắ̰, à̰, â̰, aʔa̰, áʔằ̰, áʔắ̰), aː(aː, áː, àː, âː), ãː(ãː, ã́ː, ã̀ː, ã̂ː), aˤ(aˤ, áˤ, àˤ, âˤ), ãˤ(ãˤ, ã́ˤ, ã̀ˤ, ã̂ˤ), õˤ(õˤ, ṍˤ, õ̀ˤ, õ̂ˤ), ã̰(ã̰, ã̰́, ã̰̀, ã̰̂), o(o, ó, ò, ô, ŏ̀), o̰(o̰, ó̰, ò̰, ô̰, oʔo̰, óʔŏ̰́), oː(oː, óː, òː, ôː), õ̰(õ̰, ṍ̰, õ̰̀, õ̰̂), u(u, ú, ù, û), ũ(ũ, ṹ, ũ̀, ũ̂), ṵ(ṵ, ṵ́, ṵ̀, ṵ̂, uʔṵ, úʔṵ̆́, úʔṵ̆̀, ùʔṵ̆̀), uː(uː, úː, ùː, ûː), ũː(ũː, ṹː, ũ̀ː, ũ̂ː), uˤ(uˤ, úˤ, ùˤ, ûˤ, oˤ, óˤ, òˤ, ôˤ), ũˤ(ũˤ, ṹˤ, ũ̀ˤ, ũ̂ˤ), ṵ̃(ṵ̃, ṵ̃́, ṵ̃̀, ṵ̃̂) """,
            "vowel")
        # The rising-intonation arrow is not a segment; record it as a
        # known missing so it is tolerated in transcripts.
        self.f.known_missings.update(self.f.parse_list([
            "/↗/",
        ]))

    def test_get_maximal(self):
        """get_maximal should return the longest inventory match ('ó') and
        hand back the unconsumed remainder for further parsing."""
        max_, store = self.f.get_maximal(['ó', 'ʔ', 'ẁ̰'])
        assert max_ == ['ó']
        assert store == ['ʔ', 'ẁ̰']

    def test_one(self):
        """Regression: 'jó↗óʔẁ̰' must parse fully, with 'óʔẁ̰' resolving to
        the ʔw̰ consonant (via its allophone ʔẁ̰) instead of being flagged
        as missing.

        Previously parsed as:
        130. j(j, ɲ) consonant
        131. o(o, ó, ò, ô, ŏ̀) vowel
        132. ↗ other
        133. óʔẁ̰ missing *
        134. punctuation
        """
        transcript = self.f.standardise("jó↗óʔẁ̰")
        parsed = self.f.parse_transcript(transcript)
        assert parsed[0] == Token("j(j, ɲ)")
        assert parsed[1] == Token("o(o, ó, ò, ô, ŏ̀)")
        assert parsed[2] == MissingToken("↗")
        assert parsed[3] == Token("o(o, ó, ò, ô, ŏ̀)")
        assert parsed[4] == Token("ʔw̰(ʔw̰, ʔŋ̰ʷ, ʔẁ̰)")