def rule2(first, sec):
    """Apply the special n -> retroflex-n rule across a word junction.

    Every 'न्' token of ``sec`` becomes 'ण्' when, scanning ``first`` from
    its end towards its start, a 'र्', 'ष्' or 'ऋ' token is reached before
    any blocking consonant. Tokens listed in ``transparent`` may sit in
    between without blocking the change.

    Scanning of ``sec`` stops at the first token that is not 'न्' and is a
    member of ``consonant``.

    Returns:
        A ``(first, sec)`` tuple of joined strings; ``first`` is only
        tokenized and re-joined, ``sec`` carries any substitutions.
    """
    # Tokens that may appear between the trigger and 'न्' without blocking.
    transparent = [
        '', 'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ए', 'ऐ', 'ओ', 'औ', 'अं', 'अः',
        'ह्', 'य्', 'व्', 'न्', 'क्', 'ख्', 'ग्', 'घ्', 'ङ्', 'प्', 'फ्',
        'ब्', 'भ्', 'म्'
    ]
    triggers = ('र्', 'ष्', 'ऋ')

    sec_tokens = complete_tokenize(sec)
    first_tokens = complete_tokenize(first)

    for pos, token in enumerate(sec_tokens):
        if token != 'न्':
            if token in consonant:
                # Any other consonant in ``sec`` ends the search.
                break
            continue
        # Walk ``first`` backwards, i.e. outwards from the junction.
        for prev in reversed(first_tokens):
            if prev in transparent:
                continue
            if prev in triggers:
                sec_tokens[pos] = 'ण्'
                break
            if prev in consonant:
                # A non-transparent consonant blocks the change.
                break

    return join(first_tokens), join(sec_tokens)
def stem(word):
    """Return the stem of an inflected noun, or ``None`` if none is found.

    The word is tokenized and progressively shorter prefixes of the token
    list (longest first) are looked up in the module-level trie ``mytrie``.
    Each trie hit is joined back into a word and passed to ``search_noun``
    together with the original ``word``; the first candidate accepted by
    ``search_noun`` is returned.

    Args:
        word: The inflected form. If it contains a space, the first two
            tokens are dropped before searching (vocative forms are
            prefixed with 'हे ').

    Returns:
        The matching noun as a string, or ``None`` when no prefix yields an
        accepted candidate.
    """
    tokens = complete_tokenize(word)
    if ' ' in word:
        # Vocative forms carry a leading 'हे'; strip it before the lookup.
        # NOTE(review): assumes the particle occupies exactly two tokens -
        # confirm against complete_tokenize's handling of the space.
        tokens = tokens[2:]

    # Try the full token list first, then drop one trailing token at a time.
    for end in range(len(tokens), 0, -1):
        prefix = tokens[:end]
        for candidate in mytrie.find(prefix):
            joined_word = join(candidate)
            # Explicit comparison kept on purpose: search_noun's return
            # type is not visible here, so truthiness might differ.
            if search_noun(word, joined_word) == True:  # noqa: E712
                return joined_word
    return None
def Declension(word, gender=''):
    """Return the declension table of a noun for the given gender.

    Args:
        word: The noun stem to decline.
        gender: Suffix appended to the stem class returned by
            ``stem_class(word)`` to form the name of the ending table
            (e.g. stem class + gender tag).

    Returns:
        For words listed in ``words_tagging.unique``, whatever object the
        name ``word`` evaluates to (presumably a prebuilt table - confirm).
        Otherwise a list of 8 rows with 3 forms each. Rows follow the case
        order prathama, dvitiya, tritiya, chaturthi, panchami, shashthi,
        saptami, sambodhana; every form in the last (vocative) row is
        prefixed with 'हे '. The three columns are presumably
        singular/dual/plural - confirm against the ending tables.
    """
    if word in words_tagging.unique:
        # NOTE(security): eval() resolves the table by name. It is guarded
        # by the membership test above, but must never see untrusted text.
        return eval(word)

    special_stem = ['as_stem_', 'an_stem_', 'en_stem_']
    stem_c = stem_class(word)
    stem_type = stem_c + gender
    word_list = complete_tokenize(word)
    # Special stem classes lose their last two tokens, all others one.
    if stem_c in special_stem:
        word_prefix = word_list[:-2]
    else:
        word_prefix = word_list[:-1]
    word = join(word_prefix)

    # Fall back to the '_1' variant when the plain name is not a known
    # noun class.
    if stem_type not in words_tagging.dict_noun.values():
        stem_type = stem_type + '_1'

    # NOTE(security): ``gender`` comes from the caller and ends up inside
    # eval(); callers must only pass trusted gender tags.
    # Resolved once instead of once per cell.
    endings = eval(stem_type)

    Dec = []
    for row in range(8):
        case = []
        for col in range(3):
            form = sandhi(word, endings[row][col])
            if row == 7:
                form = 'हे ' + form
            case.append(form)
        Dec.append(case)
    return Dec
def Declension_noun(word):
    """Return the declension table of a noun known to the database.

    Unlike ``Declension`` no gender is passed in; it is looked up with
    ``gender(word)`` and appended to ``stem_class(word)`` to form the name
    of the ending table.

    Args:
        word: The noun stem to decline.

    Returns:
        For words listed in ``words_tagging.unique``, whatever object the
        name ``word`` evaluates to (presumably a prebuilt table - confirm).
        Otherwise a list of 8 rows with 3 forms each, built by joining the
        truncated stem with each ending via ``sandhi``; every form in the
        last row (index 7) is prefixed with 'हे '.
    """
    if word in words_tagging.unique:
        # NOTE(security): eval() resolves the table by name. It is guarded
        # by the membership test above, but must never see untrusted text.
        return eval(word)

    special_stem = ['as_stem_', 'an_stem_', 'en_stem_']
    stem_c = stem_class(word)
    gen = gender(word)
    stem_type = stem_c + gen
    word_list = complete_tokenize(word)
    # Special stem classes lose their last two tokens, all others one.
    if stem_c in special_stem:
        word_prefix = word_list[:-2]
    else:
        word_prefix = word_list[:-1]
    word = join(word_prefix)

    # Resolve the ending table once instead of once per cell.
    endings = eval(stem_type)

    decl = []
    for row in range(8):
        case = []
        for col in range(3):
            form = sandhi(word, endings[row][col])
            if row == 7:
                form = 'हे ' + form
            case.append(form)
        decl.append(case)
    return decl
def sandhi(first, second):
    """Join ``first`` and ``second``, applying sandhi at the junction.

    ``rule2`` is applied first. Then, depending on whether the last token
    of ``first`` is a member of ``consonant``, the parts are joined with
    ``rule1`` (consonant junction) or ``rule0`` (vowel junction).

    Args:
        first: Left-hand part (e.g. a stem).
        second: Right-hand part (e.g. an ending).

    Returns:
        The combined string. If either part is empty there is no junction
        to resolve and the plain concatenation is returned.
    """
    if not first or not second:
        # Without this guard the last-token / first-token lookups below
        # (and in rule0/rule1) raise IndexError on empty input.
        return first + second

    # The consonant test uses the tokens of the original ``first``.
    first_tokens = complete_tokenize(first)
    first, second = rule2(first, second)
    if first_tokens[-1] in consonant:
        return rule1(first, second)
    return rule0(first, second)
def rule1(first, second):
    """Join two parts with dental-to-palatal assimilation at the junction.

    The dentals 'स्', 'त्', 'थ्', 'द्', 'ध्', 'न्' map to the palatals
    'श्', 'च्', 'छ्', 'ज्', 'झ्', 'ञ्' respectively. When one side of the
    junction holds a dental and the other side holds one of those palatals,
    the dental is replaced by its palatal counterpart. Otherwise the parts
    are joined unchanged.

    Returns:
        The joined string.
    """
    palatal_of = {
        'स्': 'श्',
        'त्': 'च्',
        'थ्': 'छ्',
        'द्': 'ज्',
        'ध्': 'झ्',
        'न्': 'ञ्'
    }

    left = complete_tokenize(first)
    right = complete_tokenize(second)
    tail, head = left[-1], right[0]
    palatals = palatal_of.values()

    if head in palatal_of and tail in palatals:
        # Dental opens the right part, palatal closes the left part.
        right[0] = palatal_of[head]
    elif head in palatals and tail in palatal_of:
        # Palatal opens the right part, dental closes the left part.
        left[-1] = palatal_of[tail]

    return join(left + right)
def initialize():
    """Build and return a trie holding every noun of the word database.

    Each noun in each group of ``words_tagging.all_noun`` is tokenized with
    ``complete_tokenize`` and inserted into a fresh ``Trie.Trie``.
    """
    trie = Trie.Trie()
    token_lists = (
        complete_tokenize(entry)
        for group in words_tagging.all_noun
        for entry in group
    )
    for tokens in token_lists:
        trie.insert(tokens)
    return trie
def rule0(first, second):
    """Join two parts applying vowel (svara) sandhi at the junction.

    The last token of ``first`` and the first token of ``second`` are
    combined according to these rule groups, checked in this order:

    1. dirgha - like simple vowels merge into the long vowel
       (a/aa + a/aa -> aa, i/ii + i/ii -> ii, u/uu + u/uu -> uu,
       'ऋ' + 'ऋ' -> 'ऋ').
    2. guna / vriddhi - 'अ'/'आ' followed by another vowel
       (+ i/ii -> 'ए', + u/uu -> 'ओ', + 'ऋ' -> 'अ','र्',
       + 'ऌ' -> 'अ','ल्', + 'ए'/'ऐ' -> 'ऐ', + 'ओ'/'औ' -> 'औ').
    3. yan - i/ii, u/uu, 'ऋ', 'ऌ' before an unlike vowel become the
       semivowels 'य्', 'व्', 'र्', 'ल्'; the following vowel is kept.
    4. ayadi - 'ए', 'ऐ', 'ओ', 'औ' before a vowel become
       'अ','य्' / 'आ','य्' / 'अ','व्' / 'आ','व्'; the following vowel
       is kept.

    Any other pair is joined unchanged. For background see
    https://hi.wikipedia.org/wiki/संधि_(व्याकरण)

    Returns:
        The joined string.

    Raises:
        IndexError: if either part tokenizes to an empty list.
    """
    f = complete_tokenize(first)
    s = complete_tokenize(second)
    # Remove the two junction tokens; they are replaced by ``add`` below.
    f_last = f.pop()
    s_start = s.pop(0)

    a_pair = ('अ', 'आ')
    i_pair = ('इ', 'ई')
    u_pair = ('उ', 'ऊ')
    # Vowels that trigger the yan / ayadi rules when they follow.
    following_vowels = ('अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ए', 'ऐ', 'ओ', 'औ')

    # guna / vriddhi: result of 'अ' or 'आ' + the keyed vowel.
    after_a = {
        'इ': ['ए'], 'ई': ['ए'],
        'उ': ['ओ'], 'ऊ': ['ओ'],
        'ऋ': ['अ', 'र्'],
        'ऌ': ['अ', 'ल्'],
        'ए': ['ऐ'], 'ऐ': ['ऐ'],
        'ओ': ['औ'], 'औ': ['औ'],
    }
    # yan / ayadi: what the keyed final vowel turns into before a vowel.
    # 'उ'/'ऊ' yield the semivowel 'व्' (previously the vowel 'ऊ' by
    # mistake). 'आ' must be the single code point U+0906, not 'अ' followed
    # by the vowel sign, so that it matches the other tokens.
    before_vowel = {
        'इ': ['य्'], 'ई': ['य्'],
        'उ': ['व्'], 'ऊ': ['व्'],
        'ऋ': ['र्'],
        'ऌ': ['ल्'],
        'ए': ['अ', 'य्'],
        'ऐ': ['आ', 'य्'],
        'ओ': ['अ', 'व्'],
        'औ': ['आ', 'व्'],
    }

    if f_last in a_pair and s_start in a_pair:
        add = ['आ']
    elif f_last in i_pair and s_start in i_pair:
        add = ['ई']
    elif f_last in u_pair and s_start in u_pair:
        add = ['ऊ']
    elif f_last == 'ऋ' and s_start == 'ऋ':
        add = ['ऋ']
    elif f_last in a_pair and s_start in after_a:
        add = list(after_a[s_start])
    elif f_last in before_vowel and s_start in following_vowels:
        # Like-vowel pairs were already consumed by the dirgha branches.
        add = before_vowel[f_last] + [s_start]
    else:
        add = [f_last, s_start]

    return join(f + add + s)