Exemplos de is_maori em Python, exemplos de reo_toolkit.is_maori em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: encoders.py Projeto: TeHikuMedia/reo-toolkit

 def encode(self, text):
     """Encode the Māori words in ``text`` syllable by syllable.

     The text is preprocessed with ``self.preprocess`` and split into word
     tokens. Tokens that ``is_maori`` rejects are passed through unchanged.
     Every other token is split into syllables by ``self.tokenize``; each
     syllable is mapped through ``self.encoder_dict`` to a consonant/vowel
     pair, which ``jamo.j2h`` combines into one encoded character.

     Args:
         text: The input string to encode.

     Returns:
         The detokenized string with Māori words replaced by their
         encoded form.

     Raises:
         KeyError: If a character of a syllable has no entry in
             ``self.encoder_dict``.
     """
     # Kept as a function-local import (as in the original), but hoisted so
     # it is not re-executed for every token.
     from reo_toolkit import is_maori

     text = self.preprocess(text, vowel_type=self.vowel_type)
     words = []
     for word in TreebankWordTokenizer().tokenize(text):
         if not is_maori(word):
             words.append(word)
             continue
         encoded_text = []
         for syllable in self.tokenize(word):
             # Syllables containing characters outside the alphabet are
             # copied through untouched.
             if not all(ch in alphabet for ch in syllable):
                 encoded_text.append(syllable)
                 continue
             original_syllable = syllable
             # A bare vowel gets the placeholder consonant 'x' so that it
             # still yields a consonant/vowel pair.
             if syllable in vowels:
                 syllable = 'x' + syllable
             try:
                 consonant, vowel = ''.join(
                     [self.encoder_dict[ch] for ch in syllable])
             except KeyError as err:
                 logging.error(
                     "KeyError: phoneme {} not in encoder_dict".format(
                         syllable))
                 # Keep the offending syllable and the original cause
                 # attached to the exception.
                 raise KeyError(syllable) from err
             try:
                 encoded = jamo.j2h(consonant, vowel)
             except jamo.InvalidJamoError:
                 logging.error(
                     'InvalidJamoError - Consonant={} Vowel={} Syllable={}'.
                     format(consonant, vowel, syllable))
                 # Previously `encoded` was left unbound (or stale from the
                 # previous syllable) here; fall back to the unencoded
                 # syllable so the output stays aligned with the input.
                 encoded = original_syllable
             encoded_text.append(encoded)
         words.append(''.join(encoded_text))
     return TreebankWordDetokenizer().detokenize(words)

Exemplo n.º 2

0

Exibir arquivo

Arquivo: textutils.py Projeto: TeHikuMedia/nga-tautohetohe-reo

def tidy_text(text):
    """Keep only the Māori sentences of ``text``, paragraph by paragraph.

    The text is split into paragraphs on newlines and each paragraph into
    sentences with ``sent_tokenize``. A sentence is dropped when more of its
    tokens contain a digit than contain a letter, or when ``is_maori``
    rejects it, or when it contains no letter at all.

    Args:
        text: The raw input string.

    Returns:
        The surviving sentences, joined by single spaces within a paragraph
        and by blank lines between paragraphs, with runs of three or more
        newlines collapsed to two.
    """
    # The original class '[A-z...]' also matched the ASCII punctuation that
    # sits between 'Z' and 'a' ('[', '\\', ']', '^', '_', '`'), so
    # punctuation-only tokens were counted as words. IGNORECASE already
    # covers the upper-case letters, including the macronised vowels.
    letter_pattern = re.compile('[a-zāēīōū]', re.IGNORECASE)
    digit_pattern = re.compile('[0-9]')

    paras = []
    for para in text.split("\n"):
        sents = []
        para = para.strip()
        for sent in sent_tokenize(para):
            tokens = word_tokenize(sent)
            words = sum(1 for token in tokens if letter_pattern.search(token))
            nums = sum(1 for token in tokens if digit_pattern.search(token))
            if nums > words:
                logging.debug(
                    'Rejected this sentence due to too many numbers: {}'.
                    format(sent))
                continue
            if is_maori(sent) and letter_pattern.search(sent):
                sents.append(sent)
        paras.append(' '.join(sents))

    text = '\n\n'.join(paras)
    # Paragraphs that ended up empty leave runs of newlines behind.
    return re.sub("\n{3,}", "\n\n", text)

Exemplo n.º 3

0

Exibir arquivo

def test_apostrophe():
    """A sentence containing a phrase wrapped in apostrophes is accepted."""
    quoted_sentence = (
        "Ko 'Mā whero, mā pango, ka oti te mahi' ētahi o ngā whakatauki rongonui"
    )
    verdict = is_maori(quoted_sentence)
    assert verdict

Exemplo n.º 4

0

Exibir arquivo

def test_camel_case():
    """Camel-cased input: "KeiTePai" is accepted, "MeToo" is rejected."""
    accepted = is_maori("KeiTePai")
    assert accepted
    # The second word is checked with strict mode switched off.
    rejected = is_maori("MeToo", strict=False)
    assert not rejected

Exemplo n.º 5

0

Exibir arquivo

def test_many_vowels():
    """A word with a long run of consecutive vowels is accepted."""
    word = "Papaoiea"
    verdict = is_maori(word)
    assert verdict

Exemplo n.º 6

0

Exibir arquivo

def test_macron_combining_character():
    """Accept 'a' followed by the combining macron.

    The Unicode code point U+0304 is a combining character that adds a
    macron to the preceding letter.
    """
    combined = 'a\u0304'
    # Round-trip through UTF-8 bytes before checking, as the test always has.
    round_tripped = combined.encode('utf-8').decode()
    assert is_maori(round_tripped)

Exemplo n.º 7

0

Exibir arquivo

def test_te_tiriti_o_waitangi():
    """The text in data/te-tiriti-o-waitangi.txt passes the strict check."""
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open('data/te-tiriti-o-waitangi.txt', 'r', encoding='utf-8') as f:
        transcript = f.read()
    # The file is closed before the (potentially slow) check runs.
    assert is_maori(transcript, strict=True)

Exemplo n.º 8

0

Exibir arquivo

def test_okina():
    """Text with a leading space and an ʻokina character is rejected."""
    sample = " ʻokina"
    verdict = is_maori(sample)
    assert not verdict

Exemplo n.º 9

0

Exibir arquivo

def test_māori_word():
    """A sentence written without any macrons is accepted."""
    sentence = 'Ko matou ko nga Tino Rangatira o nga iwi o Nu Tireni'
    verdict = is_maori(sentence)
    assert verdict

Exemplo n.º 10

0

Exibir arquivo

def test_cleaning():
    """The word "six" is rejected.

    Removing its non-Māori characters would leave the Māori word 'i', so
    this guards against judging only what is left after cleaning.
    """
    word = "six"
    verdict = is_maori(word)
    assert not verdict

Exemplo n.º 11

0

Exibir arquivo

def test_ambiguous_word():
    """The word 'a' is rejected with strict=False, accepted with strict=True."""
    word = 'a'
    non_strict_verdict = is_maori(word, strict=False)
    assert not non_strict_verdict
    strict_verdict = is_maori(word, strict=True)
    assert strict_verdict

Exemplo n.º 12

0

Exibir arquivo

def test_non_maori_letter():
    """The single letter 'z' is rejected."""
    letter = 'z'
    verdict = is_maori(letter)
    assert not verdict

Exemplo n.º 13

0

Exibir arquivo

def test_ending_consonant():
    """A word that ends in a consonant ('new') is rejected."""
    word = 'new'
    verdict = is_maori(word)
    assert not verdict

Exemplo n.º 14

0

Exibir arquivo

def test_double_consonant():
    """A word starting with a doubled consonant ('mmea') is rejected."""
    word = 'mmea'
    verdict = is_maori(word)
    assert not verdict

Exemplo n.º 15

0

Exibir arquivo

def test_english_word():
    """The English name 'James Cooks' is rejected."""
    phrase = 'James Cooks'
    verdict = is_maori(phrase)
    assert not verdict

Exemplo n.º 16

0

Exibir arquivo

def test_pacific_island():
    """A word with an apostrophe inside it ("ma'unga") is rejected."""
    word = "ma'unga"
    verdict = is_maori(word)
    assert not verdict

Exemplo n.º 17

0

Exibir arquivo

def test_macron():
    """A word containing a macronised vowel ('tohutō') is accepted."""
    word = 'tohutō'
    verdict = is_maori(word)
    assert verdict

Exemplo n.º 18

0

Exibir arquivo

def test_hyphen():
    """A word with a leading hyphen ('-maori') is rejected."""
    word = '-maori'
    verdict = is_maori(word)
    assert not verdict

Exemplo n.º 19

0

Exibir arquivo

def test_all_caps():
    """An upper-case word with a macronised vowel ('WHĀTUA') is accepted."""
    word = 'WHĀTUA'
    verdict = is_maori(word)
    assert verdict

Exemplo n.º 20

0

Exibir arquivo

def test_long_hyphenated_word():
    """A very long word joined by internal hyphens is accepted."""
    place_name = 'Taumatawhakatangi-hangakoauauotamatea-turipukakapikimaunga-horonukupokaiwhenua-kitanatahu'
    verdict = is_maori(place_name)
    assert verdict

Exemplo n.º 21

0

Exibir arquivo

def test_he_whakaputanga():
    """The text in data/he-whakaputanga.txt passes the strict check."""
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open('data/he-whakaputanga.txt', 'r', encoding='utf-8') as f:
        transcript = f.read()
    # The file is closed before the (potentially slow) check runs.
    assert is_maori(transcript, strict=True)

Exemplo n.º 22

0

Exibir arquivo

def test_non_maori_word():
    """The word 'tongue' is rejected even with strict mode off."""
    word = 'tongue'
    verdict = is_maori(word, strict=False)
    assert not verdict

Exemplo n.º 23

0

Exibir arquivo

def test_triple_vowel():
    """A word with the same vowel three times in a row ("teee") is rejected."""
    word = "teee"
    verdict = is_maori(word)
    assert not verdict

Exemplo n.º 24

0

Exibir arquivo

def test_sentence():
    """A full lower-case sentence with macrons passes the strict check."""
    sentence = "inā tatū te tai ka puare tēnei toka ka taea te haere mai i reira ki uta"
    verdict = is_maori(sentence, strict=True)
    assert verdict