Exemplo n.º 1
0
def create_gensim_dictionary(data_path, no_below=2, no_above=0.1):
    """Tokenize every file under *data_path* and build a gensim dictionary.

    Each file is read line by line. Spaces, newlines, ``#`` and ``@`` are
    stripped from a line before it is passed to ``Jumanpp().analysis``;
    morphemes whose ``midasi`` is longer than one character are collected
    as that document's tokens.

    Args:
        data_path: Directory that is walked recursively with ``os.walk``.
        no_below: Forwarded to ``Dictionary.filter_extremes``.
        no_above: Forwarded to ``Dictionary.filter_extremes``.

    Returns:
        A ``(docs, docs_title, dictionary)`` tuple where ``docs`` maps a
        document name to its token list, ``docs_title`` maps a document
        name to the raw first line of the file (``""`` for an empty file),
        and ``dictionary`` is the filtered ``gensim.corpora.Dictionary``.
        Document names are paths relative to *data_path*, so files located
        directly in *data_path* are keyed by their bare file name.
    """
    print("# morphological analysis")

    # Build the analyzer once instead of once per input line.
    analyzer = Jumanpp()
    # Single-pass removal of the characters the analyzer should not see.
    strip_table = str.maketrans("", "", " \n#@")

    # Initialised outside the walk so results from every directory are kept
    # and the names exist even when the walk yields nothing.
    docs = {}
    docs_title = {}

    for root, _dirs, files in os.walk(data_path):
        for filename in files:
            # Join with ``root`` (not ``data_path``) so files in nested
            # directories can actually be opened.
            path = os.path.join(root, filename)
            docname = os.path.relpath(path, data_path)

            with open(path, "r") as f:
                lines = f.readlines()

            # An empty file has no first line; fall back to an empty title.
            docs_title[docname] = lines[0] if lines else ""
            tokens = docs[docname] = []

            for text in lines:
                cleaned = text.translate(strip_table)
                if not cleaned:
                    # Nothing to analyse; skipping here avoids re-using the
                    # previous line's analysis result for blank lines.
                    continue
                result = analyzer.analysis(cleaned)
                tokens.extend(
                    mrph.midasi
                    for mrph in result.mrph_list()
                    if len(mrph.midasi) > 1
                )

    dictionary = gensim.corpora.Dictionary(docs.values())
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)

    return docs, docs_title, dictionary
Exemplo n.º 2
0
def split_into_words(text):
    """Convert an article into a list of words.

    Runs ``Jumanpp().analysis`` on *text* and returns the ``midasi`` value
    of every morpheme in the result, in order.
    """
    words = []
    for morpheme in Jumanpp().analysis(text).mrph_list():
        words.append(morpheme.midasi)
    return words
Exemplo n.º 3
0
def split_into_words(text):
    """Return the ``midasi`` of each morpheme Jumanpp finds in *text*."""
    analyzer = Jumanpp()
    morphemes = analyzer.analysis(text).mrph_list()
    words = []
    for morpheme in morphemes:
        words.append(morpheme.midasi)
    return words
Exemplo n.º 4
0
def split_into_words(text):
    """Analyse *text* with Jumanpp and list each morpheme's ``midasi``."""
    parsed = Jumanpp().analysis(text)
    tokens = []
    for item in parsed.mrph_list():
        tokens.append(item.midasi)
    return tokens
Exemplo n.º 5
0
def morphological_analysis(doc):
    """Run Jumanpp on *doc* and return the ``midasi`` of every morpheme.

    The values are returned as a list in the order ``mrph_list()`` yields
    them.
    """
    morphemes = Jumanpp().analysis(doc).mrph_list()
    surfaces = []
    for morpheme in morphemes:
        surfaces.append(morpheme.midasi)
    return surfaces