def separate(string):
    """
    Separate a Vietnamese word into 3 components: the start sound,
    the middle sound and the end sound.

    Eg: toán -> [u't', u'oá', u't']

    Args:
        string - the word to split.

    Returns:
        A 3-item list [start, middle, end]. The empty string yields
        [u'', u'', u'']. Returns None when is_valid_combination()
        rejects the split (not a valid Vietnamese word).
    """
    comps = [u'', u'', u'']
    if string == u'':
        return comps

    # Everything before the first vowel is the start sound.
    for i, char in enumerate(string):
        if utils.is_vowel(char):
            comps[0] = u'' + string[:i]
            string = u'' + string[i:]
            break
    else:
        # No vowel at all: the whole string is the start sound.
        comps[0] = string
        string = u''

    # The vowel run ends at the first consonant after the first vowel.
    for i, char in enumerate(string):
        if not utils.is_vowel(char):
            comps[1] = string[:i]
            comps[2] = string[i:]
            break

    # No ending consonant? Then the rest of the string must be the vowel part
    if comps[1] == u'':
        comps[1] = string

    # 'gi' and 'qu' need some special treatments
    # We want something like this:
    #     ['g', 'ia', ''] -> ['gi', 'a', '']
    start, middle = comps[0], comps[1]
    if start != u'' and middle != u'':
        # Compare the start sound against the exact letters. A substring
        # test such as `start in u'gG'` would also accept the two-letter
        # start 'gG' (or 'qQ').
        is_gi = (start in (u'g', u'G') and middle[0] in u'iI'
                 and len(middle) > 1)
        is_qu = start in (u'q', u'Q') and middle[0] in u'uU'
        if is_gi or is_qu:
            comps[0] += middle[:1]
            comps[1] = middle[1:]

    if not is_valid_combination(comps):
        return None
    return comps
# # Note that when undo 'ư' with 'w', this function will always return # 'uw' because of lack of raw string information. It is up to the # user of this module to change the returned value to 'w' when necessary. # if new_comps == comps: for trans in trans_list: new_comps = reverse(new_comps, trans) tmp = list(new_comps) if tmp != comps: new_comps = utils.append_comps(new_comps, unicode(key)) return garbage + utils.join(new_comps) new_comps = utils.append_comps(new_comps, unicode(key)) # One last check to rule out cases like 'ảch' or 'chuyểnl' if not is_valid_combination(new_comps): return None return garbage + utils.join(new_comps) def get_transformation_list(key, im, case=0): """ Return list of transformations inferred from entered key. The map between transform types and keys is given by module bogo_config (if exists) or by variable simple_telex_im if entered key is not in im, return u"+key", meaning appending the entered key to current text """ if key in im: lkey = key
def process_key(string, key, case=0, config=DefaultConfig()):
    """
    Process the given string and key based on the configured input method.

    Args:
        string - the text entered so far. Only the trailing run of
            alphabetic characters is processed; everything up to and
            including the last non-alphabetic character is set aside
            and prepended to the result unchanged.
        key - the key that was just entered.
        case (optional) - Force the output's case. Mostly to determine
            the case of TELEX's [, ] keys. 0: lower, 1: upper. Default: 0.
        config (optional) - an object whose `input_method` attribute
            names an entry of IMs (documented values: 'telex',
            'simple-telex', 'vni'). Unknown names fall back to 'telex'.
            The default DefaultConfig() instance is created once, when
            the function is defined, and is only read here.

    Returns:
        The resulting text, or None when the processed part is not a
        valid combination (separate() or is_valid_combination()
        rejects it).
    """
    ## BEGIN TRICKS (scroll down please)
    im = config.input_method

    # People can sometimes be really mischievous :<
    if im in IMs:
        im = IMs[im]
    else:
        im = IMs['telex']

    # Handle non-alpha string like 'tôi_là_ai' by putting 'tôi_là_' in
    # the `garbage` variable, effectively skipping it then put it back
    # later.
    # TODO Should this be the ibus engine's job?
    garbage = u''
    # Scan from the end for the last non-alphabetic character.
    for pos in reversed(range(len(string))):
        if not string[pos].isalpha():
            garbage += string[:pos + 1]
            string = u'' + string[pos + 1:]
            break

    # Handle process_key('â', '_')
    if key not in im and not key.isalpha():
        string += key
        return garbage + string
    ## END TRICKS (here comes real code)

    # Try to break the string down to 3 components
    # separate('chuyen') = ['ch', 'uye', 'n']
    comps = separate(string)

    # Refuse to process things like process('zzam', 'f')
    if comps is None:
        return None

    # Apply transformations
    trans_list = get_transformation_list(key, im, case=case)
    new_comps = comps

    for trans in trans_list:
        new_comps = transform(new_comps, trans)

    # Double typing an IM key to undo.
    # Eg: process_key(u'à', 'f')
    #  -> transform(['', u'à', ''], '\\') = ['', 'à', '']
    #  -> reverse(u'à', '\\') = 'a'
    #
    # Note that when undo 'ư' with 'w', this function will always return
    # 'uw' because of lack of raw string information. It is up to the
    # user of this module to change the returned value to 'w' when necessary.
    #
    # NOTE(review): the nesting of this undo loop was reconstructed from
    # unindented source -- confirm the check belongs inside the `for`.
    if new_comps == comps:
        for trans in trans_list:
            new_comps = reverse(new_comps, trans)
            # Return as soon as one reversal actually changed something.
            if list(new_comps) != comps:
                new_comps = utils.append_comps(new_comps, unicode(key))
                return garbage + utils.join(new_comps)
        # Nothing to transform or undo: treat the key as a plain letter.
        new_comps = utils.append_comps(new_comps, unicode(key))

    # One last check to rule out cases like 'ảch' or 'chuyểnl'
    if not is_valid_combination(new_comps):
        return None
    return garbage + utils.join(new_comps)