from cltk.corpus.sanskrit.itrans.unicode_transliterate import ItransTransliterator from cltk.tokenize.sentence import TokenizeSentence from cltk.stem.sanskrit.indian_syllabifier import Syllabifier lang = "hi" language = "hindi" tokenizer = TokenizeSentence("sanskrit") syl = Syllabifier(language) #List of phonemes that should not be counted as separate diphones while splitting check_phonemes_1 = ["ः", "ऽ", "ङ्\u200d"] check_phonemes_2 = ["\u200c"] #List of characters that should be taken to the left in case they are present to the right while splitting move_left_1 = ['म्', 'र्', 'न्'] #Checking for numbers and purna-viram def check_token(token): flag = True if token == "।": flag = False elif token.isdigit(): flag = False return flag #Checking for splitting position def check_proximity(split, pos, next_token): if len(split) - pos in range(1, 3):
""" @author: sourabh garg """ import itertools import re import words_tagging from cltk.stem.sanskrit.indian_syllabifier import Syllabifier from cltk.corpus.sanskrit.alphabet import * lang = 'hindi' h = Syllabifier(lang) VOWELS = [ INDEPENDENT_VOWELS_SIMPLE, INDEPENDENT_VOWELS_DIPTHONGS, INDEPENDENT_VOWELS ] VOWELS = list(itertools.chain(*VOWELS)) CONSONANTS = [ CONSONANT_GUTTURALS, CONSONANT_PALATALS, CONSONANT_CEREBRALS, CONSONANT_DENTALS, CONSONANT_LABIALS, SEMIVOWEL_CONSONANT, SIBILANT_CONSONANT, SONANT_ASPIRATE ] CONSONANTS = list(itertools.chain(*CONSONANTS)) CONSONANT_HALANTA = [x + '्' for x in CONSONANTS] CONS_TO_CONS = dict(zip(CONSONANT_HALANTA, CONSONANTS)) matraa_to_vowel = { '': 'अ', 'ा': 'आ', 'ि': 'इ', 'ी': 'ई', 'ु': 'उ',