# Example 1 (scraped snippet header — original text: "Exemplo n.º 1" / "0")
if __name__ == "__main__":
    # Read the raw HTML sample. A 'with' block guarantees the handle is
    # closed even if read() raises; the original open()/close() pair
    # leaked the file descriptor on any exception in between.
    with open("source/sample.html", "r", encoding="utf_8") as fileobj:
        text = fileobj.read()

    # Cleaning passes to make the text easier to run through
    # morphological analysis (strip header, HTML/JS tags, URLs, code).
    tcleaner = TextCleaner.TextCleaner()
    text = tcleaner.remove_header(text)
    text = tcleaner.clean_html_and_js_tags(text)
    text = tcleaner.clean_url(text)
    text = tcleaner.clean_code(text)
    text = tcleaner.clean_text(text)
    tcleaner.output(text)

    # --- Tokenization -------------------------------------------------
    # Segment the cleaned text into words with Janome (pure-Python
    # Japanese morphological analyzer). `words` is consumed by the
    # normalization loop further down.
    janome_tokenizer = Tokenizer.JanomeTokenizer()
    words = janome_tokenizer.wakati(text)
    # To keep nouns only, swap in:
    #words = janome_tokenizer.filter_by_pos(text, pos=('名詞'))
    janome_tokenizer.output(words)

    # MeCab-based alternative, kept disabled for reference:
    #tokenizer = Tokenizer.MeCabTokenizer()
    #words = tokenizer.wakati(text)
    #words = tokenizer.filter_by_pos(text, pos=('名詞'))
    #tokenizer.output(words)

    # Normalize every token and collect the results in `nwords`.
    # NOTE(review): the for-loop body continues past this excerpt
    # (nwords is presumably appended to further down — verify).
    tnormalizer = TextNormalizer.TextNormalizer()
    nwords = []
    for w in words:
        # normalize(): project-defined token normalization — exact rules
        # (case folding, width conversion, etc.) not visible here.
        nw = tnormalizer.normalize(w)
        nw = tnormalizer.lemmatize_term(nw, pos='v')  # pos='v': lemmatize as a verb