예제 #1
0
def separate(string):
    """
        Split a Vietnamese word into three parts: the leading consonant
        cluster, the vowel cluster and the trailing consonant cluster.

        Eg: toán -> [u't', u'oá', u't']

        An empty string yields three empty parts. Returns None when
        is_valid_combination() rejects the resulting parts.
    """
    if string == u'':
        return [u'', u'', u'']

    # Position of the first vowel, or None when the word has no vowel.
    vowel_at = next(
        (pos for pos, char in enumerate(string) if utils.is_vowel(char)),
        None)

    if vowel_at is None:
        # No vowel at all: the whole word is the leading part.
        onset, rest = string, u''
    else:
        onset = u'' + string[:vowel_at]
        rest = u'' + string[vowel_at:]

    # Cut the remainder at the first consonant that follows the vowels.
    nucleus, coda = u'', u''
    for pos, char in enumerate(rest):
        if not utils.is_vowel(char):
            nucleus, coda = rest[:pos], rest[pos:]
            break

    # No trailing consonant: everything left over is the vowel part.
    if nucleus == u'':
        nucleus = rest

    # 'gi' and 'qu' act as leading consonants, so the first vowel moves
    # over to the leading part:
    #     ['g', 'ia', ''] -> ['gi', 'a', '']
    if onset != u'' and nucleus != u'':
        lead = nucleus[0]
        is_gi = onset in u'gG' and lead in 'iI' and len(nucleus) > 1
        if is_gi or (onset in u'qQ' and lead in 'uU'):
            onset += nucleus[:1]
            nucleus = nucleus[1:]

    comps = [onset, nucleus, coda]
    if is_valid_combination(comps):
        return comps
    return None
예제 #2
0
def separate(string):
    """
        Split a Vietnamese word into three parts: the leading consonant
        cluster, the vowel cluster and the trailing consonant cluster.

        Eg: toán -> [u't', u'oá', u't']

        An empty string yields three empty parts. Returns None when
        is_valid_combination() rejects the resulting parts.
    """
    if string == u'':
        return [u'', u'', u'']

    # Position of the first vowel, or None when the word has no vowel.
    vowel_at = next(
        (pos for pos, char in enumerate(string) if utils.is_vowel(char)),
        None)

    if vowel_at is None:
        # No vowel at all: the whole word is the leading part.
        onset, rest = string, u''
    else:
        onset = u'' + string[:vowel_at]
        rest = u'' + string[vowel_at:]

    # Cut the remainder at the first consonant that follows the vowels.
    nucleus, coda = u'', u''
    for pos, char in enumerate(rest):
        if not utils.is_vowel(char):
            nucleus, coda = rest[:pos], rest[pos:]
            break

    # No trailing consonant: everything left over is the vowel part.
    if nucleus == u'':
        nucleus = rest

    # 'gi' and 'qu' act as leading consonants, so the first vowel moves
    # over to the leading part:
    #     ['g', 'ia', ''] -> ['gi', 'a', '']
    if onset != u'' and nucleus != u'':
        lead = nucleus[0]
        is_gi = onset in u'gG' and lead in 'iI' and len(nucleus) > 1
        if is_gi or (onset in u'qQ' and lead in 'uU'):
            onset += nucleus[:1]
            nucleus = nucleus[1:]

    comps = [onset, nucleus, coda]
    if is_valid_combination(comps):
        return comps
    return None
예제 #3
0
    #
    # Note that when undo 'ư' with 'w', this function will always return
    # 'uw' because of lack of raw string information. It is up to the
    # user of this module to change the returned value to 'w' when necessary.
    #
    if new_comps == comps:
        for trans in trans_list:
            new_comps = reverse(new_comps, trans)
            tmp = list(new_comps)
            if tmp != comps:
                new_comps = utils.append_comps(new_comps, unicode(key))
                return garbage + utils.join(new_comps)
        new_comps = utils.append_comps(new_comps, unicode(key))

    # One last check to rule out cases like 'ảch' or 'chuyểnl'
    if not is_valid_combination(new_comps):
        return None
    return garbage + utils.join(new_comps)


def get_transformation_list(key, im, case=0):
    """
        Return list of transformations inferred from entered key.  The
        map between transform types and keys is given by module
        bogo_config (if exists) or by variable simple_telex_im

        if entered key is not in im, return u"+key", meaning appending
        the entered key to current text
    """
    if key in im:
        lkey = key
예제 #4
0
def process_key(string, key, case=0, config=DefaultConfig()):
    """
    Process the given string and key based on the input method named by
    the given config.

    Args:
        string - the text typed so far.
        key - the newly typed key.
        case (optional) - Force the output's case. Mostly to determine
            the case of TELEX's [, ] keys. 0: lower, 1: upper. Default: 0.
        config (optional) - an object whose `input_method` attribute
            names an entry of `IMs` (e.g. 'telex', 'simple-telex',
            'vni'). Unknown names fall back to 'telex'.

    Returns:
        The resulting text, or None when the trailing word of `string`
        cannot be separated into components or the result fails
        is_valid_combination().
    """
    ## BEGIN TRICKS (scroll down please)
    im_name = config.input_method
    # People can sometimes be really mischievous :<
    if im_name in IMs:
        im = IMs[im_name]
    else:
        im = IMs['telex']

    # Handle non-alpha string like 'tôi_là_ai' by putting 'tôi_là_' in
    # the `garbage` variable, effectively skipping it then put it back
    # later.
    # TODO Should this be the ibus engine's job?
    garbage = u''
    # Scan backwards for the last non-alphabetic character; everything up
    # to and including it is garbage, the rest is the word to process.
    for i in range(len(string) - 1, -1, -1):
        if not string[i].isalpha():
            garbage += string[:i + 1]
            string = u'' + string[i + 1:]
            break

    # Handle process_key('â', '_'): a non-alphabetic key that the input
    # method does not know is appended verbatim.
    if key not in im and not key.isalpha():
        string += key
        return garbage + string

    ## END TRICKS (here comes real code)

    # Try to break the string down to 3 components
    # separate('chuyen') = ['ch', 'uye', 'n']
    comps = separate(string)

    # Refuse to process things like process('zzam', 'f')
    if comps is None:
        return None

    # Apply transformations
    trans_list = get_transformation_list(key, im, case=case)
    new_comps = comps

    for trans in trans_list:
        new_comps = transform(new_comps, trans)

    # Double typing an IM key to undo.
    # Eg: process_key(u'à', 'f')
    #  -> transform(['', u'à', ''], '\\') = ['', 'à', '']
    #  -> reverse(u'à', '\\') = 'a'
    #
    # Note that when undo 'ư' with 'w', this function will always return
    # 'uw' because of lack of raw string information. It is up to the
    # user of this module to change the returned value to 'w' when necessary.
    #
    if new_comps == comps:
        for trans in trans_list:
            new_comps = reverse(new_comps, trans)
            # Stop at the first reversal that changes something; this
            # path returns without the final validity check below.
            if list(new_comps) != comps:
                new_comps = utils.append_comps(new_comps, unicode(key))
                return garbage + utils.join(new_comps)
        # Nothing to transform and nothing to undo: append the raw key.
        new_comps = utils.append_comps(new_comps, unicode(key))

    # One last check to rule out cases like 'ảch' or 'chuyểnl'
    if not is_valid_combination(new_comps):
        return None
    return garbage + utils.join(new_comps)