示例#1
0
from cltk.corpus.sanskrit.itrans.unicode_transliterate import ItransTransliterator
from cltk.tokenize.sentence import TokenizeSentence
from cltk.stem.sanskrit.indian_syllabifier import Syllabifier

# Language identifiers. `language` is passed to the syllabifier below; `lang`
# is not used anywhere in the visible part of this file.
lang = "hi"
language = "hindi"
# NOTE(review): the sentence tokenizer is built for "sanskrit" while the
# syllabifier is built for "hindi" -- presumably intentional, confirm.
tokenizer = TokenizeSentence("sanskrit")
syl = Syllabifier(language)

# Phonemes that should not be counted as separate diphones while splitting.
# The first list holds the visarga, the avagraha and a halanta form of "nga"
# followed by a zero-width joiner (U+200D); the second list holds the
# zero-width non-joiner (U+200C).
check_phonemes_1 = ["ः", "ऽ", "ङ्‍\u200d"]
check_phonemes_2 = ["\u200c"]

# Characters that should be moved to the left-hand part when they turn up on
# the right-hand side of a split.
move_left_1 = ['म्', 'र्', 'न्']


#Checking for numbers and purna-viram
def check_token(token):
    """Tell whether ``token`` is an ordinary token.

    Returns ``False`` when the token is the purna-viram (``"।"``) or when
    ``token.isdigit()`` is true, and ``True`` for everything else.
    """
    is_purna_viram = token == "।"
    # Short-circuits exactly like the if/elif chain: isdigit() is only
    # consulted when the token is not the purna-viram.
    return not (is_purna_viram or token.isdigit())


#Checking for splitting position
def check_proximity(split, pos, next_token):
    if len(split) - pos in range(1, 3):
示例#2
0
"""
@author: sourabh garg
"""
import itertools
import re
import words_tagging
from cltk.stem.sanskrit.indian_syllabifier import Syllabifier
from cltk.corpus.sanskrit.alphabet import *

# Syllabifier instance configured with the language name below.
lang = 'hindi'
h = Syllabifier(lang)

# Flat list of the independent-vowel groups, concatenated in order.
VOWELS = list(itertools.chain(
    INDEPENDENT_VOWELS_SIMPLE,
    INDEPENDENT_VOWELS_DIPTHONGS,
    INDEPENDENT_VOWELS,
))

# Flat list of all consonant groups, concatenated in order.
CONSONANTS = list(itertools.chain(
    CONSONANT_GUTTURALS,
    CONSONANT_PALATALS,
    CONSONANT_CEREBRALS,
    CONSONANT_DENTALS,
    CONSONANT_LABIALS,
    SEMIVOWEL_CONSONANT,
    SIBILANT_CONSONANT,
    SONANT_ASPIRATE,
))

# Each consonant with the halanta sign appended, in the same order as
# CONSONANTS.
CONSONANT_HALANTA = [consonant + '्' for consonant in CONSONANTS]

# Lookup from the halanta form back to the plain consonant.
CONS_TO_CONS = {
    with_halanta: plain
    for with_halanta, plain in zip(CONSONANT_HALANTA, CONSONANTS)
}
matraa_to_vowel = {
    '': 'अ',
    'ा': 'आ',
    'ि': 'इ',
    'ी': 'ई',
    'ु': 'उ',