Exemplo n.º 1
0
def _merge_sokuon_tokens(tokens):
    ''' Merge each token whose reading ends with 'ッ' into the token after it

    A reading that ends with a small 'ッ' cannot be converted to roman
    letters on its own ('ハシッ' would become 'hashitsu'), so the following
    token is appended to it: 'ハシッ' + 'タ' => 'ハシッタ'.

    Parameters
    ----------
    tokens : list
        Token objects exposing writable ``surface`` and ``reading``
        attributes. The merged tokens are modified in place.

    Returns
    -------
    merged : list
        New list holding the same token objects, without the tokens that
        were absorbed into their predecessor.
    '''
    merged = []
    pos = 0
    total = len(tokens)
    while pos < total:
        head = tokens[pos]
        pos += 1
        # Keep absorbing followers while the accumulated reading still ends
        # with 'ッ'. A trailing 'ッ' with no follower is left untouched
        # instead of raising IndexError.
        while head.reading.endswith('ッ') and pos < total:
            tail = tokens[pos]
            pos += 1
            head.surface += tail.surface
            # '*' is the "no reading" placeholder; fall back to the surface
            # (same rule as for 'katakana' below) so that no literal '*'
            # leaks into the merged reading.
            head.reading += (
                tail.reading if tail.reading != '*' else tail.surface)
        merged.append(head)
    return merged


def get_tokens_simple(sentence):
    ''' Separate sentence into a list of word tokens (simple mode)

    The sentence is first passed through the character table
    ``const.SIMBOL_CONVERT_TO_ZENKAKU`` (presumably half-width symbols to
    full-width ones - confirm in const), tokenized with ``Tokenizer`` and
    then tokens whose reading ends with 'ッ' are merged with the token
    that follows them.

    Parameters
    ----------
    sentence : str
        1 line of Japanese example(sentence) ex. 私は走った

    Returns
    -------
    result : list of dict

    result[n]['surface'] : str,
        Text of the token as written in the sentence.
    result[n]['base'] : str,
        Base form reported by the tokenizer (for merged tokens, the base
        form of the first token).
    result[n]['katakana'] : str,
        Reading of the token; the surface is used when the tokenizer
        reports no reading ('*').
    result[n]['wordclass'] : str,
        Label of the first matching rule in ``const.WORD_CLASS_CONV``,
        otherwise the first field of the token's part of speech.
      ex.
      result[0] = {'surface': '私',    'base': '私', 'katakana': 'ワタシ',  'wordclass': '代名詞'}
      result[1] = {'surface': 'は',    'base': 'は', 'katakana': 'ハ',      'wordclass': '助詞'}
      result[2] = {'surface': '走った','base': '走る','katakana': 'ハシッタ','wordclass': '動詞'}
    '''
    # Normalization of the input characters
    sentence = sentence.translate(
        str.maketrans(const.SIMBOL_CONVERT_TO_ZENKAKU))

    # tokenize() may return a generator (newer tokenizer releases), so it is
    # materialized before the tokens are indexed and merged.
    tokens = _merge_sokuon_tokens(list(Tokenizer().tokenize(sentence)))

    result = []
    for t in tokens:
        if const.DEBUG:
            print("DEBUG)", t.base_form, ":", t.part_of_speech)

        wordclass = t.part_of_speech.split(',')
        # Default label: top-level part of speech. It is replaced by the
        # label (4th item) of the first rule whose three leading items each
        # match the corresponding part-of-speech field; None is a wildcard.
        wordclass_str = wordclass[0]
        for class_chk_conv in const.WORD_CLASS_CONV:
            if all(class_chk_conv[k] is None
                   or class_chk_conv[k] == wordclass[k]
                   for k in range(3)):
                wordclass_str = class_chk_conv[3]
                break

        result.append({
            "surface": t.surface,
            "base": t.base_form,
            "katakana": t.reading if t.reading != '*' else t.surface,
            "wordclass": wordclass_str,
        })

    return result