def concatenate_char_and_diacritization(ip_letters, nn_labels):
    """Combine each input letter with its predicted diacritic label.

    Parameters:
        ip_letters: iterable of input letters; may contain the sentinel
            strings 'space' and 'pad'.
        nn_labels: iterable of predicted diacritic labels, aligned
            pairwise with ip_letters.

    Returns:
        A list where each element is the letter concatenated with its
        diacritic(s), or the letter alone when the label is a sentinel
        or a single non-combining character.
    """
    nn_diacritized_letters = []
    for ip_each_letter, each_nn_labels in zip(ip_letters, nn_labels):
        try:
            if each_nn_labels in ('space', 'pad') or ip_each_letter in ('space', 'pad'):
                # Sentinel tokens pass through untouched.
                nn_diacritized_letters.append(ip_each_letter)
            elif len(each_nn_labels) > 1:
                # Multi-character label (e.g. shadda + vowel): always attach.
                nn_diacritized_letters.append(ip_each_letter + each_nn_labels)
            elif not unicodedata2.combining(each_nn_labels):
                # Single non-combining label carries no diacritic.
                nn_diacritized_letters.append(ip_each_letter)
            else:
                nn_diacritized_letters.append(ip_each_letter + each_nn_labels)
        except Exception:
            # Was a bare ``except: c = 1`` that silently dropped the pair;
            # keep the best-effort skip, but no longer swallow
            # SystemExit/KeyboardInterrupt.
            continue
    return nn_diacritized_letters
def correct_alef_prev_char_normal_case_version_2(letter):
    """Ensure the character before an alef carries a fatha-compatible mark.

    Walks the (possibly decomposed) ``letter``: the last base character is
    kept, fatha/shadda/fathatan marks are preserved, any other combining
    mark is swapped for a fatha, and a fatha is appended when no mark was
    present at all.  Returns the NFC-composed result.
    """
    rebuilt = ""
    composed = ""
    saw_mark = False
    for symbol in letter:
        if not unicodedata2.combining(symbol):
            # Base character: restart the buffer from it.
            rebuilt = symbol
            composed = unicodedata2.normalize('NFC', symbol)
        elif symbol in (u'َ', u'ّ', u'ً'):
            # Acceptable marks are preserved as-is.
            rebuilt += symbol
            composed = unicodedata2.normalize('NFC', rebuilt)
            saw_mark = True
        else:
            # Any other mark becomes a fatha.
            rebuilt += u'َ'
            composed = unicodedata2.normalize('NFC', rebuilt)
            saw_mark = True
    if not saw_mark:
        # No combining mark seen: append a fatha.
        composed = unicodedata2.normalize('NFC', rebuilt + u'َ')
    return composed
def correct_alef_maksora_prev_char_normal_case_version_2(letter):
    """Ensure the character before an alef maksora carries a fatha.

    Walks the (possibly decomposed) ``letter`` string: the last base
    character is kept, fatha/shadda/fathatan marks are preserved, and any
    other combining mark is replaced with a fatha.  If no mark was seen, a
    fatha is appended.  Returns the NFC-composed result.

    Raises:
        Exception: if iterating/inspecting ``letter`` fails (e.g. a
            non-string element).
    """
    overall = ""
    comp = ""
    is_corrected = False
    try:
        for c in letter:
            if not unicodedata2.combining(c):
                # Base character: restart from it.
                overall = c
                comp = unicodedata2.normalize('NFC', c)
            elif c == u'َ' or c == u'ّ' or c == u'ً':
                # fatha / shadda / fathatan are already correct - keep them.
                overall += c
                comp = unicodedata2.normalize('NFC', overall)
                is_corrected = True
            else:
                # Any other mark is replaced with a fatha.
                overall += u'َ'
                comp = unicodedata2.normalize('NFC', overall)
                is_corrected = True
    except Exception:
        # Was a bare ``except:`` — that also trapped SystemExit and
        # KeyboardInterrupt; narrowed to Exception. Same error type is
        # raised so existing callers' handlers still match.
        raise Exception("bug found in correct_alef_maksora_prev_char_normal_case")
    if not is_corrected:
        # No combining mark seen: append a fatha.
        comp = unicodedata2.normalize('NFC', overall + u'َ')
    return comp
def teh_marbota_char_correction(char):
    """Normalise the diacritics carried by a teh-marbuta character.

    fatha/shadda/fathatan marks are kept, any other combining mark is
    replaced with a fatha, and a fatha is appended when ``char`` has no
    mark at all.  Returns the NFC-composed result.
    """
    buffered = ""
    result = ""
    mark_seen = False
    for glyph in char:
        if unicodedata2.combining(glyph):
            # Keep acceptable marks; anything else is demoted to a fatha.
            if glyph == u'َ' or glyph == u'ّ' or glyph == u'ً':
                buffered += glyph
            else:
                buffered += u'َ'
            result = unicodedata2.normalize('NFC', buffered)
            mark_seen = True
        else:
            # Base character: restart the buffer from it.
            buffered = glyph
            result = unicodedata2.normalize('NFC', glyph)
    if not mark_seen:
        # No combining mark seen: append a fatha.
        result = unicodedata2.normalize('NFC', buffered + u'َ')
    return result
def __init__(self, name, features, characters, font):
    """Store glyph identity, its feature tags, characters, and owning font.

    Parameters:
        name: glyph name.
        features: iterable of feature-tag strings.
        characters: string of characters this glyph represents (may be empty).
        font: owning font object.
    """
    self.name = name
    self.features = features
    self.characters = characters
    # True when the first character is a combining mark.
    # Was ``True if X else False`` — redundant ternary; bool(X) is identical.
    self.combining = bool(characters and uni.combining(characters[0]))
    # Lookup key: the characters plus the joined feature tags.
    self.key = self.characters + ''.join(features)
    self.font = font
def __init__(self, name, features, characters, font):
    """Store glyph identity plus metrics pulled from the backing TTFont.

    Parameters:
        name: glyph name (must exist in ``font.ttfont``).
        features: iterable of feature-tag strings.
        characters: string of characters this glyph represents (may be empty).
        font: owning font object exposing a ``ttfont`` (fontTools TTFont).
    """
    self.name = name
    self.features = features
    self.characters = characters
    # True when the first character is a combining mark.
    # Was ``True if X else False`` — redundant ternary; bool(X) is identical.
    self.combining = bool(characters and uni.combining(characters[0]))
    # Lookup key: the characters plus the joined feature tags.
    self.key = self.characters + ''.join(features)
    self.font = font
    # Glyph id and advance width come straight from the backing TTFont.
    self.index = self.font.ttfont.getGlyphID(name)
    self.width = self.font.ttfont['hmtx'].metrics[name][0]
def correct_alef_prev_char_mem(prev_char_object):
    """Force a kasra on the character held in ``prev_char_object.letter``.

    Every combining mark found is replaced with a kasra; when the letter
    carries no mark at all, a kasra is appended.  Returns the NFC-composed
    string.
    """
    rebuilt = ""
    composed = ""
    replaced_any = False
    for symbol in prev_char_object.letter:
        if unicodedata2.combining(symbol):
            # Whatever the mark is, it becomes a kasra.
            rebuilt += u'ِ'
            composed = unicodedata2.normalize('NFC', rebuilt)
            replaced_any = True
        else:
            # Base character: restart the buffer from it.
            rebuilt = symbol
            composed = unicodedata2.normalize('NFC', symbol)
    if not replaced_any:
        # No combining mark seen: append a kasra.
        composed = unicodedata2.normalize('NFC', rebuilt + u'ِ')
    return composed
def correct_alef_prev_char_ba2_maksora_version_2(letter):
    """Replace any mark on ``letter`` with a kasra (appending one if absent).

    The last base character is kept, every combining mark is turned into a
    kasra, and a kasra is appended when no mark exists.  Returns the
    NFC-composed result.
    """
    acc = ""
    out = ""
    kasra_applied = False
    for ch in letter:
        if unicodedata2.combining(ch):
            # Whatever the mark is, it becomes a kasra.
            acc += u'ِ'
            out = unicodedata2.normalize('NFC', acc)
            kasra_applied = True
        else:
            # Base character: restart the buffer from it.
            acc = ch
            out = unicodedata2.normalize('NFC', ch)
    if not kasra_applied:
        # No combining mark seen: append a kasra.
        out = unicodedata2.normalize('NFC', acc + u'ِ')
    return out
def remove_diacritics(character):
    """Strip combining marks from ``character`` after NFKD decomposition.

    Three marks are deliberately retained: madda above (U+0653), hamza
    above (U+0654) and hamza below (U+0655).  Note the result stays in
    decomposed form (no NFC recomposition is applied).
    """
    decomposed = unicodedata2.normalize('NFKD', str(character))
    kept = [c for c in decomposed
            if c in (u'ٓ', u'ٔ', u'ٕ') or not unicodedata2.combining(c)]
    return u"".join(kept)
if uniValue in languageMap: contextMatch = True context = languageMap[uniValue]["context"] if context: contextMatch = False ## After_I # The last preceding base character was # an uppercase I, and there is no inter- # vening combining character class 230. if context == "After_I": previous = None for otherUniValue in reversed(glyphs[:index]): previous = otherUniValue if isinstance(otherUniValue, basestring): break combining = unicodedata.combining(unichr(otherUniValue)) if combining == 230: previous = None break if combining == 0: break if previous == convertCodeToInt("0049"): contextMatch = True elif context == "Not_After_I": # not referenced in SpecialCasing raise NotImplementedError ## After_Soft_Dotted # The last preceding character with a # combining class of zero before C was # Soft_Dotted, and there is no interven- # ing combining character class 230