def orthographic_syllabify(word,lang):
    """Split ``word`` into orthographic syllables.

    Every character is copied to an output buffer; a single space is
    written after a character whenever the rules below decide that a
    syllable ends there.  The buffer is then stripped and split on spaces.

    Args:
        word: the text to syllabify; it is indexed and iterated character
            by character.
        lang: language identifier forwarded unchanged to
            ``si.get_phonetic_feature_vector`` for every character.

    Returns:
        A list of syllable strings.  An empty ``word`` yields ``['']``
        because that is what splitting an empty string on ``' '`` gives.
    """
    p_vectors = [si.get_phonetic_feature_vector(c, lang) for c in word]
    n = len(word)
    syllables = []

    # range() instead of the Python-2-only xrange(): works on both 2 and 3.
    for i in range(n):
        v = p_vectors[i]
        syllables.append(word[i])
        has_next = i + 1 < n

        if has_next and (not si.is_valid(p_vectors[i + 1]) or
                         si.is_misc(p_vectors[i + 1])):
            # The following character is invalid/misc: close the syllable
            # before it.
            syllables.append(u' ')

        elif not si.is_valid(v) or si.is_misc(v):
            # The current character is invalid/misc: it stands alone.
            syllables.append(u' ')

        elif si.is_vowel(v):
            # A vowel ends the syllable unless it is followed by an anusvaar
            # that is either word-final or followed by a non-plosive; in
            # those two cases the anusvaar stays attached to this vowel.
            anu_nonplos = (i + 2 < n and
                           si.is_anusvaar(p_vectors[i + 1]) and
                           not si.is_plosive(p_vectors[i + 2]))
            anu_eow = (i + 2 == n and
                       si.is_anusvaar(p_vectors[i + 1]))
            if not (anu_nonplos or anu_eow):
                syllables.append(u' ')

        elif has_next and (si.is_consonant(v) or si.is_nukta(v)):
            if si.is_consonant(p_vectors[i + 1]):
                # Consonant/nukta followed by a consonant.
                syllables.append(u' ')
            elif (si.is_vowel(p_vectors[i + 1]) and
                  not si.is_dependent_vowel(p_vectors[i + 1])):
                # Followed by a vowel that is not a dependent vowel.
                syllables.append(u' ')
            elif si.is_anusvaar(p_vectors[i + 1]):
                # Same anusvaar rule as for vowels: keep it attached when it
                # is word-final or followed by a non-plosive.
                anu_nonplos = (i + 2 < n and
                               not si.is_plosive(p_vectors[i + 2]))
                anu_eow = i + 2 == n
                if not (anu_nonplos or anu_eow):
                    syllables.append(u' ')

    return u''.join(syllables).strip().split(u' ')
def orthographic_syllabify(word, lang):
    """Break ``word`` into orthographic syllables.

    Each character is emitted in order, followed by a space wherever a
    syllable boundary is detected; the joined text is stripped and split
    on single spaces.

    Args:
        word: text to syllabify, processed one character at a time.
        lang: language identifier handed to
            ``si.get_phonetic_feature_vector``.

    Returns:
        List of syllable strings (``['']`` for an empty ``word``).
    """
    features = [si.get_phonetic_feature_vector(ch, lang) for ch in word]
    total = len(word)

    def boundary_follows(pos):
        """Tell whether a syllable break goes right after position ``pos``."""
        cur = features[pos]
        has_next = pos + 1 < total
        has_second = pos + 2 < total

        # A following invalid/misc character always forces a break.
        if has_next:
            nxt = features[pos + 1]
            if not si.is_valid(nxt) or si.is_misc(nxt):
                return True

        # An invalid/misc character is isolated from what comes after it.
        if not si.is_valid(cur) or si.is_misc(cur):
            return True

        if si.is_vowel(cur):
            if not has_next:
                return True
            if not si.is_anusvaar(features[pos + 1]):
                return True
            # Vowel + anusvaar: break only when a plosive comes after the
            # anusvaar; a word-final anusvaar stays attached.
            return has_second and si.is_plosive(features[pos + 2])

        if has_next and (si.is_consonant(cur) or si.is_nukta(cur)):
            nxt = features[pos + 1]
            if si.is_consonant(nxt):
                return True
            if si.is_vowel(nxt) and not si.is_dependent_vowel(nxt):
                return True
            if si.is_anusvaar(nxt):
                # Same anusvaar rule as in the vowel case.
                return has_second and si.is_plosive(features[pos + 2])

        return False

    pieces = []
    for pos, ch in enumerate(word):
        pieces.append(ch)
        if boundary_follows(pos):
            pieces.append(' ')

    joined = ''.join(pieces)
    return joined.strip().split(' ')
def orthographic_syllabify_improved(word,lang):
    """Split ``word`` into orthographic syllables, with 'ml'/'pa' normalization.

    For ``lang == 'ml'`` and ``lang == 'pa'`` the word is first passed
    through ``normalize_malayalam`` / ``normalize_punjabi``, which return
    the rewritten word plus a per-character mask.  The syllable boundaries
    are then found exactly as in the plain syllabifier, while a mask string
    is built in lock-step with the output (every inserted space gets mask
    ``'0'``).  Finally the matching ``denormalize_*`` function is applied to
    the spaced text and its mask before splitting.

    Args:
        word: the text to syllabify.
        lang: language identifier; forwarded to
            ``si.get_phonetic_feature_vector`` and used to select the
            normalize/denormalize pair.

    Returns:
        A list of syllable strings (``['']`` for an empty ``word`` in a
        language without normalization).

    Side effects:
        Prints ``Warning`` when the final mask contains the substring
        ``'01'``.
        NOTE(review): what the mask digits mean is defined by the
        ``normalize_*`` helpers, which are not visible here -- confirm.
    """
    word_mask = ['0'] * len(word)

    if lang == 'ml':
        word, word_mask = normalize_malayalam(word)
    elif lang == 'pa':
        word, word_mask = normalize_punjabi(word)

    p_vectors = [si.get_phonetic_feature_vector(c, lang) for c in word]
    n = len(word)

    syllables = []
    syllables_mask = []

    # range() instead of the Python-2-only xrange(): works on both 2 and 3.
    for i in range(n):
        v = p_vectors[i]
        syllables.append(word[i])
        syllables_mask.append(word_mask[i])
        has_next = i + 1 < n

        # Decide whether a syllable ends after character i.
        if has_next and (not si.is_valid(p_vectors[i + 1]) or
                         si.is_misc(p_vectors[i + 1])):
            # Next character is invalid/misc: close the syllable before it.
            add_break = True

        elif not si.is_valid(v) or si.is_misc(v):
            # Current character is invalid/misc: it stands alone.
            add_break = True

        elif si.is_vowel(v):
            # A vowel ends the syllable unless an anusvaar follows that is
            # word-final or itself followed by a non-plosive.
            anu_nonplos = (i + 2 < n and
                           si.is_anusvaar(p_vectors[i + 1]) and
                           not si.is_plosive(p_vectors[i + 2]))
            anu_eow = (i + 2 == n and
                       si.is_anusvaar(p_vectors[i + 1]))
            add_break = not (anu_nonplos or anu_eow)

        elif has_next and (si.is_consonant(v) or si.is_nukta(v)):
            if si.is_consonant(p_vectors[i + 1]):
                add_break = True
            elif (si.is_vowel(p_vectors[i + 1]) and
                  not si.is_dependent_vowel(p_vectors[i + 1])):
                add_break = True
            elif si.is_anusvaar(p_vectors[i + 1]):
                # Same anusvaar rule as for vowels.
                anu_nonplos = (i + 2 < n and
                               not si.is_plosive(p_vectors[i + 2]))
                anu_eow = i + 2 == n
                add_break = not (anu_nonplos or anu_eow)
            else:
                add_break = False

        else:
            add_break = False

        if add_break:
            # Keep text and mask the same length: a separator is masked '0'.
            syllables.append(u' ')
            syllables_mask.append(u'0')

    syllables_mask = u''.join(syllables_mask)
    syllables = u''.join(syllables)

    if syllables_mask.find('01') >= 0:
        # Function-call form prints the same text on Python 2 and Python 3.
        print('Warning')

    if lang == 'ml':
        syllables = denormalize_malayalam(syllables, syllables_mask)
    elif lang == 'pa':
        syllables = denormalize_punjabi(syllables, syllables_mask)

    return syllables.strip().split(u' ')
def orthographic_syllabify_improved(word, lang):
    """Syllabify ``word`` orthographically, normalizing 'ml' and 'pa' input.

    When ``lang`` is ``'ml'`` or ``'pa'`` the word goes through
    ``normalize_malayalam`` / ``normalize_punjabi`` first (each returns a
    new word and a per-character mask) and the spaced result is passed to
    the matching ``denormalize_*`` function together with its mask.  For any
    other language the mask is all ``'0'``.

    Args:
        word: text to syllabify.
        lang: language identifier; also handed to
            ``si.get_phonetic_feature_vector``.

    Returns:
        List of syllable strings (``['']`` for an empty ``word`` in a
        language without normalization).

    Side effects:
        Prints ``Warning`` if the joined mask contains ``'01'``.
        NOTE(review): the meaning of the mask digits comes from the
        ``normalize_*`` helpers, not visible here -- confirm.
    """
    word_mask = ['0'] * len(word)
    if lang == 'ml':
        word, word_mask = normalize_malayalam(word)
    elif lang == 'pa':
        word, word_mask = normalize_punjabi(word)

    features = [si.get_phonetic_feature_vector(ch, lang) for ch in word]
    total = len(word)

    def boundary_follows(pos):
        """Tell whether a syllable break goes right after position ``pos``."""
        cur = features[pos]
        has_next = pos + 1 < total
        has_second = pos + 2 < total

        # A following invalid/misc character always forces a break.
        if has_next:
            nxt = features[pos + 1]
            if not si.is_valid(nxt) or si.is_misc(nxt):
                return True

        # An invalid/misc character is isolated from what comes after it.
        if not si.is_valid(cur) or si.is_misc(cur):
            return True

        if si.is_vowel(cur):
            if not has_next:
                return True
            if not si.is_anusvaar(features[pos + 1]):
                return True
            # Vowel + anusvaar: break only when a plosive comes after the
            # anusvaar; a word-final anusvaar stays attached.
            return has_second and si.is_plosive(features[pos + 2])

        if has_next and (si.is_consonant(cur) or si.is_nukta(cur)):
            nxt = features[pos + 1]
            if si.is_consonant(nxt):
                return True
            if si.is_vowel(nxt) and not si.is_dependent_vowel(nxt):
                return True
            if si.is_anusvaar(nxt):
                # Same anusvaar rule as in the vowel case.
                return has_second and si.is_plosive(features[pos + 2])

        return False

    out_chars = []
    out_mask = []
    for pos, ch in enumerate(word):
        out_chars.append(ch)
        out_mask.append(word_mask[pos])
        if boundary_follows(pos):
            # Separator and its mask entry are added together so the two
            # sequences stay aligned.
            out_chars.append(' ')
            out_mask.append('0')

    mask_text = ''.join(out_mask)
    text = ''.join(out_chars)

    if '01' in mask_text:
        print('Warning')

    if lang == 'ml':
        text = denormalize_malayalam(text, mask_text)
    elif lang == 'pa':
        text = denormalize_punjabi(text, mask_text)

    return text.strip().split(' ')