예제 #1
0
    def getNotVocalised(self, sents):
        """Collect the runs of consecutive words that carry no diacritics.

        Each sentence is split with ``MyToolKit.words``. Every maximal run of
        adjacent words for which ``MyToolKit.HasDiac`` is false is encoded as
        one string:

        * a two-unit left context followed by ``'#'`` -- ``'###'`` when the
          run starts the sentence, otherwise the last two items returned by
          ``MyToolKit.LettersDiac`` for the preceding word;
        * the words of the run joined by ``'#'``;
        * a trailing ``'#'``.

        Args:
            sents: sequence of sentence strings.

        Returns:
            A list with one entry per sentence; each entry is the list of
            encoded runs found in that sentence, in order of appearance.
        """
        tool = MyToolKit()
        not_vocalised_by_sents = []
        for sent in sents:
            words = tool.words(sent)
            not_vocalised = []
            j = 0
            while j < len(words):
                if tool.HasDiac(words[j]):
                    j += 1
                    continue

                if j == 0:
                    # No preceding word: pad the left context with markers.
                    string = '###' + words[j]
                else:
                    char = tool.LettersDiac(words[j - 1])
                    # Indices are kept as len(char) - 2 / len(char) - 1 on
                    # purpose: if the previous word yields a single item the
                    # first index wraps to -1 and that item is used twice
                    # (assumes `char` is an indexable sequence of strings).
                    string = (char[len(char) - 2] + char[len(char) - 1] +
                              '#' + words[j])

                # Extend the run over the following un-vocalised words.
                k = j + 1
                while k < len(words) and not tool.HasDiac(words[k]):
                    string += "#" + words[k]
                    k += 1
                not_vocalised.append(string + "#")

                # `k` is an absolute index (first word after the run), so the
                # cursor jumps to it. The previous `j += k` overshot by `j`
                # and skipped words whenever the run did not start at 0.
                j = k
            not_vocalised_by_sents.append(not_vocalised)
        return not_vocalised_by_sents
예제 #2
0
    def LettersVocaliser(self, sents, smooth_const):
        """Fill in the un-vocalised parts of each sentence, letter by letter.

        The ``letters_dictionary`` table of ``data/model.db`` is loaded into a
        mapping ``type -> vocabularies``. For every encoded run returned by
        ``self.getNotVocalised(sents)`` a lattice ("matrice") is built: a list
        of columns, one per character position, where each column is a list
        of ``[int, str, int]`` candidates. The lattice is decoded with
        ``self.ViterbiLetter(matrice, smooth_const)``, the result is aligned
        with ``self.alignLetter`` and substituted into the sentence.

        Args:
            sents: mutable sequence (list) of sentence strings. It is
                modified in place.
            smooth_const: value forwarded unchanged to ``self.ViterbiLetter``.

        Returns:
            The same ``sents`` object, with the processed runs replaced.

        Raises:
            KeyError: if a character of a run has no entry in the letters
                dictionary.
        """
        tool = MyToolKit()
        hdb = DBHandler('data/model.db')
        hdb.connect()
        # NOTE(review): the handle is never closed in this method; the
        # DBHandler API is not visible here -- confirm whether it should be.
        res = hdb.getFromTable('letters_dictionary',
                               attribute='type,vocabularies')
        # Named so that the builtin `dict` is not shadowed.
        letter_vocabularies = {}
        for r in res:
            letter_vocabularies[r[0]] = r[1]

        not_vocalised = self.getNotVocalised(sents)
        # Iterate over the sentences.
        for i in range(len(sents)):
            # Iterate over the non-vocalised strings of this sentence.
            for chunk in not_vocalised[i]:
                char = tool.LettersDiac(chunk)
                # The first three columns are fixed: the two left-context
                # units followed by the '#' separator.
                matrice = [
                    [[-1, tool.HideChar(char[0], expect=['#', ' ']), 1]],
                    [[0, tool.HideChar(char[1], expect=['#', ' ']), 1]],
                    [[0, "#", 1]],
                ]

                string = tool.DeleteDiacritic(chunk)

                # Iterate over the characters, starting after the context.
                k = 3
                while k < len(string):
                    # A word-initial "ال" gets two fixed columns instead of
                    # dictionary candidates. The look-ahead is bounds-checked
                    # so an alif in last position falls through to the
                    # dictionary branch instead of raising IndexError.
                    if (string[k - 1] == "#" and string[k] == 'ا'
                            and k + 1 < len(string)
                            and string[k + 1] == 'ل'):
                        matrice.append([[0, '_', 1]])
                        matrice.append([[0, '_ْ', 1]])
                        k += 2
                    else:
                        # Iterate over the possibilities for this character.
                        list_dict = []
                        candidates = tool.words(letter_vocabularies[string[k]])
                        for possib in candidates:
                            if possib == "#":
                                list_dict.append([-1, possib, 1])
                            else:
                                list_dict.append([
                                    -1,
                                    tool.HideChar(possib, expect=['#', ' ']),
                                    0,
                                ])
                        matrice.append(list_dict)
                        k += 1

                v = self.ViterbiLetter(matrice, smooth_const)
                string = self.alignLetter(v, chunk)
                # Turn the '#' markers back into spaces before substituting;
                # str.replace rewrites every occurrence in the sentence.
                sents[i] = sents[i].replace(
                    chunk.replace('#', ' ').strip(),
                    string.replace('#', ' ').strip())

        return sents