def get_tokens_simple(sentence):
    """Split *sentence* into word tokens (simple mode).

    Parameters
    ----------
    sentence : str
        One line of Japanese text, e.g. ``私は走った``.

    Returns
    -------
    result : list of dict
        Each element has the keys ``surface`` (str), ``base`` (str),
        ``katakana`` (str) and ``wordclass`` (str), e.g.::

            result[0] = {'surface': '私', 'base': '私', 'katakana': 'ワタシ', 'wordclass': '代名詞'}
            result[1] = {'surface': 'は', 'base': 'は', 'katakana': 'ハ', 'wordclass': '助詞'}
            result[2] = {'surface': '走った', 'base': '走る', 'katakana': 'ハシッタ', 'wordclass': '動詞'}
    """
    # Normalization: convert configured half-width symbols to full-width
    # so the tokenizer sees a uniform character set.
    sentence = sentence.translate(
        str.maketrans(const.SIMBOL_CONVERT_TO_ZENKAKU))

    # list(...) keeps this working whether tokenize() returns a list
    # (old janome) or a generator (current janome).
    tokens = list(Tokenizer().tokenize(sentence))

    # Normalization: merge a token whose reading ends with small-tsu 'ッ'
    # into the following token, e.g. 'ハシッ' + 'タ' -> 'ハシッタ'.
    # A trailing 'ッ' alone cannot be romanized correctly (it would come
    # out as 'hashitsu').
    #
    # NOTE(review): the previous implementation popped from the list while
    # iterating it and indexed tokens[i + 1] unconditionally, raising
    # IndexError when the 'ッ' token was the final token (and ValueError on
    # consecutive 'ッ' tokens).  This forward walk guards both cases.
    merged = []
    i = 0
    while i < len(tokens):
        t = tokens[i]
        if t.reading.endswith('ッ') and i + 1 < len(tokens):
            t.surface += tokens[i + 1].surface
            t.reading += tokens[i + 1].reading
            i += 2  # the next token was absorbed; skip it
        else:
            i += 1
        merged.append(t)
    tokens = merged

    result = []
    for t in tokens:
        surface_str = t.surface
        base_str = t.base_form
        # '*' means the tokenizer produced no reading; fall back to the
        # surface form so callers always get usable katakana text.
        katakana_str = t.reading if t.reading != '*' else t.surface
        if const.DEBUG:
            print("DEBUG)", t.base_form, ":", t.part_of_speech)

        wordclass = t.part_of_speech.split(',')
        wordclass_str = wordclass[0]
        # Map the (major, middle, minor) part-of-speech triple to a display
        # name; None in a rule slot acts as a wildcard.  First match wins.
        for rule in const.WORD_CLASS_CONV:
            if all(rule[k] is None or rule[k] == wordclass[k]
                   for k in range(3)):
                wordclass_str = rule[3]
                break

        result.append({
            "surface": surface_str,
            "base": base_str,
            "katakana": katakana_str,
            "wordclass": wordclass_str,
        })
    return result