Exemplo n.º 1
0
def concatenate_char_and_diacritization(ip_letters, nn_labels):
    """Attach each predicted label to the input letter at the same position.

    The two sequences are walked in lockstep (``zip`` stops at the shorter
    one).  For every ``(letter, label)`` pair:

    * if either side is the marker ``'space'`` or ``'pad'`` the letter is
      emitted unchanged;
    * if the label is longer than one character, or is a single Unicode
      combining character, ``letter + label`` is emitted;
    * otherwise (a single non-combining character) the letter is emitted
      unchanged.

    Pairs that cannot be processed because a value has the wrong type or an
    empty label (``None``, a number, ``''`` ...) are skipped, which keeps the
    historical best-effort behaviour.  Any other error now propagates instead
    of being silently swallowed.

    Args:
        ip_letters: iterable of input letters (strings).
        nn_labels: iterable of predicted labels (strings), aligned with
            ``ip_letters``.

    Returns:
        list: the letters with their diacritics appended.
    """
    passthrough_markers = ('space', 'pad')
    nn_diacritized_letters = []

    for letter, label in zip(ip_letters, nn_labels):
        try:
            if label in passthrough_markers or letter in passthrough_markers:
                nn_diacritized_letters.append(letter)
            elif len(label) > 1 or unicodedata2.combining(label):
                # Multi-character labels are trusted as-is; single characters
                # are only attached when they really are combining marks.
                nn_diacritized_letters.append(letter + label)
            else:
                nn_diacritized_letters.append(letter)
        except TypeError:
            # Malformed pair (non-string or empty label): drop it, as the
            # original implementation did, but do not hide unrelated errors.
            continue

    return nn_diacritized_letters
Exemplo n.º 2
0
def correct_alef_prev_char_normal_case_version_2(letter):
    """Return ``letter`` with its diacritics forced to a fatha-compatible set.

    The characters of ``letter`` are scanned in order:

    * a non-combining character restarts the result with that character;
    * fatha, shadda and fathatan are kept as they are;
    * any other combining mark is replaced by a fatha.

    If no combining mark was seen at all, a fatha is appended.  The result
    is returned in NFC form.

    NOTE(review): judging by the name this is applied to the letter that
    precedes an alef -- confirm against the caller.
    """
    fatha = u'َ'
    allowed_marks = (u'َ', u'ّ', u'ً')

    pieces = ""
    saw_mark = False
    for symbol in letter:
        if unicodedata2.combining(symbol):
            # Keep an allowed mark, otherwise substitute a fatha.
            pieces += symbol if symbol in allowed_marks else fatha
            saw_mark = True
        else:
            # A base character discards whatever was collected so far.
            pieces = symbol

    if not saw_mark:
        pieces += fatha

    return unicodedata2.normalize('NFC', pieces)
Exemplo n.º 3
0
def correct_alef_maksora_prev_char_normal_case_version_2(letter):
    """Return ``letter`` with its diacritics forced to a fatha-compatible set.

    The characters of ``letter`` are scanned in order:

    * a non-combining character restarts the result with that character;
    * fatha, shadda and fathatan are kept as they are;
    * any other combining mark is replaced by a fatha.

    If no combining mark was seen at all, a fatha is appended.  The result
    is returned in NFC form.

    NOTE(review): judging by the name this is applied to the letter that
    precedes an alef maksura -- confirm against the caller.

    Args:
        letter: string holding a base character and its combining marks.

    Returns:
        str: the corrected, NFC-normalised string.

    Raises:
        Exception: if scanning ``letter`` fails (e.g. it is not a string);
            the original error is attached as ``__cause__``.
            ``KeyboardInterrupt`` and ``SystemExit`` are no longer converted.
    """
    fatha = u'َ'
    kept_marks = (u'َ', u'ّ', u'ً')

    overall = ""
    is_corrected = False
    try:
        for c in letter:
            if not unicodedata2.combining(c):
                # A base character discards whatever was collected so far.
                overall = c
            else:
                overall += c if c in kept_marks else fatha
                is_corrected = True
    except Exception as err:
        raise Exception(
            "bug found in correct_alef_maksora_prev_char_normal_case"
        ) from err

    if not is_corrected:
        overall += fatha

    return unicodedata2.normalize('NFC', overall)
Exemplo n.º 4
0
def teh_marbota_char_correction(char):
    """Normalise the diacritics of ``char`` so that it carries a fatha.

    Walks ``char`` one code point at a time.  A code point with combining
    class 0 becomes the new start of the output; fatha, shadda and fathatan
    are copied through; every other combining mark is swapped for a fatha.
    When the input contains no combining mark at all, a fatha is added at
    the end.  The output is NFC-normalised.

    NOTE(review): the name suggests this is used for the letter before a
    teh marbuta -- confirm against the caller.
    """
    default_mark = u'َ'
    chunks = []
    any_mark = False

    for unit in char:
        if unicodedata2.combining(unit) == 0:
            # Base character: start over from here.
            chunks = [unit]
            continue

        any_mark = True
        if unit not in (u'َ', u'ّ', u'ً'):
            unit = default_mark
        chunks.append(unit)

    if not any_mark:
        chunks.append(default_mark)

    return unicodedata2.normalize('NFC', ''.join(chunks))
Exemplo n.º 5
0
 def __init__(self, name, features, characters, font):
     """Record the constructor arguments and derive two helper attributes.

     ``combining`` is True when ``characters`` is non-empty and its first
     character has a non-zero Unicode combining class.  ``key`` is
     ``characters`` followed by all entries of ``features`` joined
     together, so ``features`` must be an iterable of strings.
     """
     self.name, self.features, self.characters = name, features, characters
     self.combining = bool(characters and uni.combining(characters[0]))
     self.key = self.characters + ''.join(features)
     self.font = font
Exemplo n.º 6
0
 def __init__(self, name, features, characters, font):
     """Record the constructor arguments and look up per-glyph font data.

     ``combining`` is True when ``characters`` is non-empty and its first
     character has a non-zero Unicode combining class.  ``key`` is
     ``characters`` followed by all entries of ``features`` joined
     together.  ``index`` and ``width`` are read from ``font.ttfont`` using
     ``name`` as the glyph name.
     """
     self.name, self.features, self.characters = name, features, characters
     self.combining = bool(characters and uni.combining(characters[0]))
     self.key = self.characters + ''.join(features)
     self.font = font

     # presumably a fontTools TTFont -- TODO confirm
     ttfont = self.font.ttfont
     self.index = ttfont.getGlyphID(name)
     # First item of the 'hmtx' metrics entry for this glyph.
     self.width = ttfont['hmtx'].metrics[name][0]
Exemplo n.º 7
0
def correct_alef_prev_char_mem(prev_char_object):
    """Return ``prev_char_object.letter`` with every diacritic set to kasra.

    Each non-combining character restarts the result with that character;
    each combining mark is replaced by a kasra.  If the letter has no
    combining mark, one kasra is appended.  The result is NFC-normalised.
    """
    kasra = u'ِ'
    result = ""
    mark_seen = False

    for symbol in prev_char_object.letter:
        if unicodedata2.combining(symbol):
            result += kasra
            mark_seen = True
            continue
        # Base character: drop anything gathered before it.
        result = symbol

    if not mark_seen:
        result += kasra

    return unicodedata2.normalize('NFC', result)
Exemplo n.º 8
0
def correct_alef_prev_char_ba2_maksora_version_2(letter):
    """Replace every diacritic of ``letter`` with a kasra.

    Scans ``letter`` character by character.  A character whose combining
    class is 0 becomes the new start of the output; any combining mark is
    emitted as a kasra instead.  When no combining mark occurs in the
    input, a single kasra is appended.  Returns the NFC form of the output.
    """
    kasra = u'ِ'
    parts = []
    has_mark = False

    for ch in letter:
        if unicodedata2.combining(ch) == 0:
            parts = [ch]
        else:
            parts.append(kasra)
            has_mark = True

    if not has_mark:
        parts.append(kasra)

    return unicodedata2.normalize('NFC', ''.join(parts))
Exemplo n.º 9
0
def remove_diacritics(character):
    """Strip combining marks from ``character``.

    The input is converted with ``str`` and decomposed with NFKD; every
    combining character is then dropped except U+0653, U+0654 and U+0655
    (Arabic maddah above, hamza above and hamza below), which are kept.
    The result is returned in its decomposed form (it is not recomposed).
    """
    preserved_marks = (u'ٓ', u'ٔ', u'ٕ')
    decomposed = unicodedata2.normalize('NFKD', str(character))

    kept = []
    for symbol in decomposed:
        if unicodedata2.combining(symbol) and symbol not in preserved_marks:
            continue
        kept.append(symbol)
    return u"".join(kept)
Exemplo n.º 10
0
 if uniValue in languageMap:
     contextMatch = True
     context = languageMap[uniValue]["context"]
     if context:
         contextMatch = False
         ## After_I
         # The last preceding base character was
         # an uppercase I, and there is no inter-
         # vening combining character class 230.
         if context == "After_I":
             previous = None
             for otherUniValue in reversed(glyphs[:index]):
                 previous = otherUniValue
                 if isinstance(otherUniValue, basestring):
                     break
                 combining = unicodedata.combining(unichr(otherUniValue))
                 if combining == 230:
                     previous = None
                     break
                 if combining == 0:
                     break
             if previous == convertCodeToInt("0049"):
                 contextMatch = True
         elif context == "Not_After_I":
             # not referenced in SpecialCasing
             raise NotImplementedError
         ## After_Soft_Dotted
         # The last preceding character with a
         # combining class of zero before C was
         # Soft_Dotted, and there is no interven-
         # ing combining character class 230