Example No. 1
def estimate(news_title):
    # Predict the category of an unseen news title.
    vec = dictionary.doc2bow(M.isMecab(news_title))
    print(vec)
    pre = list(matutils.corpus2dense([vec], num_terms=len(dictionary)).T[0])
    print(pre)
    label_predict = estimator.predict([pre])  # scikit-learn expects a list of samples
    print(label_predict)
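# Note: every example in this listing relies on a project-local Test_MeCab module
# imported as M, whose source is not shown. The sketch below is only an assumption
# of what isMecab() might look like: a thin wrapper over the mecab-python binding
# that returns the surface forms of a Japanese sentence (isMecab2() presumably
# keeps only the nouns).
import MeCab

_tagger = MeCab.Tagger("")

def isMecab(text):
    # Tokenise Japanese text with MeCab and return the surface forms.
    tokens = []
    node = _tagger.parseToNode(text)
    while node:
        if node.surface:              # BOS/EOS nodes have an empty surface form
            tokens.append(node.surface)
        node = node.next
    return tokens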
Example No. 2
def cleate_dic():
    ARTICLE_NAME = ["Computer","Entertainment","Sports","Science","Economy","World",'Politics','Society']
    ARTICLE = {"Computer":"","Entertainment":"","Sports":"","Science":"","Economy":"","World":"","Politics":"","Society":""}
    ret = []

    for n in ARTICLE_NAME:
        f = codecs.open('/Users/Soma/Onedrive/News_Dataset/article'+n+'.txt', 'r')
        ARTICLE[n]=f.readlines()
        f.close()
        for j in ARTICLE[n]:
            ret += M.isMecab2(j)

    # Run gensim's default string preprocessing over each extracted noun
    # (the result is not used further in this function).
    preprocessed_docs = {}
    for name in ret:
        preprocessed = gensim.parsing.preprocess_string(name)
        preprocessed_docs[name] = preprocessed
        #print name, ":", preprocessed

    # corpora.Dictionary expects an iterable of token lists, so the flat noun
    # list is wrapped as a single document before building the vocabulary.
    documents = corpora.Dictionary([ret])
    documents.save_as_text('noun_dic.txt')
Example No. 3
def cleate_lda_model():
    ARTICLE_NAME = ["Computer", "Entertainment", "Sports", "Science", "Economy", "World", "Politics", "Society"]
    ARTICLE = {
        "Computer": "",
        "Entertainment": "",
        "Sports": "",
        "Science": "",
        "Economy": "",
        "World": "",
        "Politics": "",
        "Society": "",
    }
    dictionary = corpora.Dictionary.load_from_text("noun_dic.txt")

    for n in ARTICLE_NAME:
        print "\ncreating " + n + " LDA model..\n"
        f = codecs.open("/Users/Soma/Onedrive/News_Dataset/article" + n + ".txt", "r")
        ARTICLE[n] = f.readlines()
        f.close()
        # Bag-of-words corpus for this category (not used below; the LDA model is
        # trained on the pre-built tf-idf corpus loaded from disk instead).
        data_train = [dictionary.doc2bow(M.isMecab(j)) for j in ARTICLE[n]]
        tfidf_corpus = gensim.corpora.MmCorpus("news_noun_" + n + ".mm")
        lda = models.LdaModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=30)
        lda.save("model_" + n + ".lda")
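# The per-category models saved above can be reloaded for inference. This is a
# hedged sketch, not part of the original code; it assumes the noun_dic.txt
# dictionary and the Test_MeCab tokeniser used throughout these examples, and the
# helper name topics_for_headline is ours.
from gensim import corpora, models
import Test_MeCab as M

def topics_for_headline(headline, category="Sports"):
    dictionary = corpora.Dictionary.load_from_text("noun_dic.txt")
    lda = models.LdaModel.load("model_" + category + ".lda")
    bow = dictionary.doc2bow(M.isMecab(headline))
    return lda.get_document_topics(bow)   # list of (topic_id, probability) pairs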
Example No. 4
def train():
    ARTICLE_NAME = ["Computer","Entertainment","Sports","Science","Economy","World",'Politics','Society']
    ARTICLE = {"Computer":"","Entertainment":"","Sports":"","Science":"","Economy":"","World":"","Politics":"","Society":""}
    ret=[]
    data_train = [[] for row in range(8)]
    train_num = 0
    train_sum = 0


    for n in ARTICLE_NAME:
        data_range = countline('/Users/Soma/Onedrive/News_Dataset/article'+n+'.txt')
        print data_range
        train_sum += data_range
        for col in range(data_range):
            data_train[train_num].append(None)
        train_num += 1

    print "\nDataset : " + str(train_sum)
    label_train = [None for col in range(train_sum)]
    article_data = [None for col in range(train_sum)]
    cate_num =0
    label_num =0
    global dictionary
    dictionary = corpora.Dictionary.load_from_text('noun_dic.txt')

    for n in ARTICLE_NAME:
        num2 = 0
        f = codecs.open('/Users/Soma/Onedrive/News_Dataset/article'+n+'.txt', 'r')
        ARTICLE[n]=f.readlines()
        f.close()
        for j in ARTICLE[n]:
            tmp = dictionary.doc2bow(M.isMecab(j))
            data_train[cate_num][num2] = list(matutils.corpus2dense([tmp], num_terms=len(dictionary)).T[0])
            label_train[label_num] = n
            num2 +=1
            label_num += 1
        cate_num += 1

    ar_num = 0
    for n in data_train:
        for d in n:
            article_data[ar_num]=d
            ar_num = ar_num+1

    print "\ntrain start!"
    print "please wait..\n"

    global estimator

    """
    # Hyper-parameter tuning of the classifier
    print "starting classifier tuning.."

    tuned_parameters = [{'n_estimators': [10, 30, 50, 70, 90, 110, 130, 150], 'max_features': ['auto', 'sqrt', 'log2', None]}]

    clf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=2, scoring='accuracy', n_jobs=-1)
    clf.fit(article_data, label_train)

    print("best estimator:")
    print(clf.best_estimator_)

    print("mean CV scores on the training data:")
    for params, mean_score, all_scores in clf.grid_scores_:
        print("{:.3f} (+/- {:.3f}) for {}".format(mean_score, all_scores.std() / 2, params))

    y_true, y_pred = label_test_s, clf.predict(data_test_s)
    print(classification_report(y_true, y_pred))
    """
    # Train the classifier. train_sum/100 is integer division in Python 2,
    # i.e. roughly one tree per 100 training articles.
    estimator = RandomForestClassifier(n_estimators=train_sum/100)
    estimator.fit(article_data, label_train)

    print("\n==== Score when training and test data are identical ====")
    print(estimator.score(article_data, label_train))
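# Scoring on the data the forest was fitted on (as above) gives an optimistic
# estimate. A minimal sketch of a held-out evaluation, assuming X and y are built
# exactly like article_data and label_train in train(); evaluate_holdout is our
# name, not the original author's.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split   # sklearn.cross_validation on older releases

def evaluate_holdout(X, y):
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(X_tr, y_tr)
    return clf.score(X_te, y_te)   # accuracy on articles the model has not seen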
Example No. 5
def train():
    ARTICLE_NAME = ["Computer","Entertainment","Sports","Science","Economy","World",'Politics','Society']
    ARTICLE = {"Computer":"","Entertainment":"","Sports":"","Science":"","Economy":"","World":"","Politics":"","Society":""}
    ret=[]
    data_train = [[] for row in range(8)]
    train_num = 0
    train_sum = 0


    for n in ARTICLE_NAME:
        data_range = countline('/Users/somatakei/Onedrive/News_Dataset/article'+n+'.txt')
        print data_range
        train_sum += data_range
        for col in range(data_range):
            data_train[train_num].append(None)
        train_num += 1

    print "\nDataset : " + str(train_sum)
    label_train = [None for col in range(train_sum)]
    article_data = [None for col in range(train_sum)]
    cate_num =0
    label_num =0
    global dictionary
    dictionary = corpora.Dictionary.load_from_text('noun_dic.txt')

    for n in ARTICLE_NAME:
        num2 = 0
        f = codecs.open('/Users/somatakei/Onedrive/News_Dataset/article'+n+'.txt', 'r')
        ARTICLE[n]=f.readlines()
        f.close()
        for j in ARTICLE[n]:
            tmp = dictionary.doc2bow(M.isMecab(j))
            data_train[cate_num][num2] = list(matutils.corpus2dense([tmp], num_terms=len(dictionary)).T[0])
            label_train[label_num] = n
            num2 +=1
            label_num += 1
        cate_num += 1

    ar_num = 0
    for n in data_train:
        for d in n:
            article_data[ar_num]=d
            ar_num = ar_num+1


    # Build the sparse bag-of-words corpus for gensim (data_train above only
    # keeps the dense vectors, so the BoW corpus is rebuilt from the raw articles).
    print "---Bag of Words Corpus---"

    bow_docs = []
    for n in ARTICLE_NAME:
        for j in ARTICLE[n]:
            bow_docs.append(dictionary.doc2bow(M.isMecab(j)))

    # Documents whose BoW vector is all zeros (no nouns found in the dictionary).
    print "\nall zeros...\n", [i for i, bow in enumerate(bow_docs) if len(bow) == 0]

    # LSI modelling
    print "\nlsi modeling.."

    num_topics = 2
    lsi_model = gensim.models.LsiModel(bow_docs, id2word=dictionary, num_topics=num_topics)
    lsi_docs = [lsi_model[bow] for bow in bow_docs]

    print "\nTopics"
    print lsi_model.print_topics()

    # Normalise the reduced vectors to unit length (the direction of the vector
    # matters more than its magnitude).
    print "\nunit vectorization.."

    unit_vecs = []
    for sparse in lsi_docs:
        vec = list(matutils.corpus2dense([sparse], num_terms=num_topics).T[0])
        norm = sum(num ** 2 for num in vec) ** 0.5
        unit_vecs.append([num / norm for num in vec] if norm > 0 else vec)

    # Train an SVM on the dense bag-of-words vectors (article_data), not on the LSI features.
    print "\ntrain start!"
    print "please wait..\n"
    global estimator
    estimator = SVC()
    estimator.fit(article_data, label_train)

    print("\n==== Score when training and test data are identical ====")
    print(estimator.score(article_data, label_train))
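# Because the LSI vectors above are normalised to unit length, comparing two
# documents by cosine similarity reduces to a dot product. A self-contained
# sketch with toy 2-topic vectors standing in for entries of unit_vecs:
def cosine(u, v):
    return sum(a * b for a, b in zip(u, v))

doc_a = [0.6, 0.8]            # hypothetical unit vectors in a 2-topic LSI space
doc_b = [1.0, 0.0]
print(cosine(doc_a, doc_b))   # 0.6: moderately similar topic mixtures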
Example No. 6
def train():
    ARTICLE_NAME = ["Computer", "Entertainment", "Sports", "Science", "Economy", "World", "Politics", "Society"]
    ARTICLE = {
        "Computer": "",
        "Entertainment": "",
        "Sports": "",
        "Science": "",
        "Economy": "",
        "World": "",
        "Politics": "",
        "Society": "",
    }
    ret = []
    data_train = [[] for row in range(8)]
    train_num = 0
    train_sum = 0

    for n in ARTICLE_NAME:
        data_range = countline("/Users/Soma/Onedrive/News_Dataset/article" + n + ".txt")
        print data_range
        train_sum += data_range
        for col in range(data_range):
            data_train[train_num].append(None)
        train_num += 1

    print "\nDataset : " + str(train_sum)
    label_train = [None for col in range(train_sum)]
    article_data = [None for col in range(train_sum)]
    cate_num = 0
    label_num = 0
    global dictionary
    dictionary = corpora.Dictionary.load_from_text("noun_dic.txt")

    for n in ARTICLE_NAME:
        num2 = 0
        f = codecs.open("/Users/Soma/Onedrive/News_Dataset/article" + n + ".txt", "r")
        ARTICLE[n] = f.readlines()
        f.close()
        for j in ARTICLE[n]:
            tmp = dictionary.doc2bow(M.isMecab(j))
            data_train[cate_num][num2] = list(matutils.corpus2dense([tmp], num_terms=len(dictionary)).T[0])
            label_train[label_num] = n
            num2 += 1
            label_num += 1
        cate_num += 1

    ar_num = 0
    for n in data_train:
        for d in n:
            article_data[ar_num] = d
            ar_num = ar_num + 1

    print "\ntrain start!"
    print "please wait..\n"

    global estimator
    # Train the classifier.
    estimator = RandomForestClassifier()
    estimator.fit(article_data, label_train)

    print("\n==== Score when training and test data are identical ====")
    print(estimator.score(article_data, label_train))
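# The LDA examples persist their models with lda.save(), but the trained forest
# here is only kept in memory. A hedged sketch of saving/loading it with joblib
# (older scikit-learn releases expose it as sklearn.externals.joblib); the file
# name is ours.
import joblib

def save_model(clf, path="news_rf.joblib"):
    joblib.dump(clf, path)          # persist a fitted classifier such as `estimator`

def load_model(path="news_rf.joblib"):
    return joblib.load(path)        # reload it later, e.g. before calling estimate()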
Example No. 7
ARTICLE_NAME = ["Computer","Entertainment","Sports","Science","Economy","World",'Politics','Society']
ARTICLE = {"Computer":"","Entertainment":"","Sports":"","Science":"","Economy":"","World":"","Politics":"","Society":""}
ret=[]
data_train = [[] for row in range(8)]
train_num = 0
train_sum = 0


for n in ARTICLE_NAME:
    data_range = countline('/Users/Soma/Onedrive/News_Dataset/article'+n+'.txt')
    train_sum += data_range
    for col in range(data_range):
        data_train[train_num].append(None)
    train_num += 1

label_train = [None for col in range(train_sum)]
article_data = [None for col in range(train_sum)]
num =0
num3 =0
dictionary = corpora.Dictionary.load_from_text('test_dic4.txt')

# Quick check: tokenise a sample headline and pretty-print the result.
M.pp(M.isMecab("香川ループ弾「衝撃」と賞賛"))
test = [[] for col in range(6)]
test[0].append("a")
test[0].append("b")
test[0].append("c")
test[1].append("d")
test[1].append("e")
print (test)
Example No. 8
def cleate_lda_model():
    ARTICLE_NAME = ["Computer", "Entertainment", "Sports", "Science", "Economy", "World", "Politics", "Society"]
    ARTICLE = {
        "Computer": "",
        "Entertainment": "",
        "Sports": "",
        "Science": "",
        "Economy": "",
        "World": "",
        "Politics": "",
        "Society": "",
    }
    dictionary = corpora.Dictionary.load_from_text("noun_dic.txt")

    for n in ARTICLE_NAME:
        print "\ncreating " + n + " LDA model..\n"
        f = codecs.open("/Users/Soma/Onedrive/News_Dataset/article" + n + ".txt", "r")
        ARTICLE[n] = f.readlines()
        f.close()
        # Bag-of-words corpus for this category (not used below; the LDA model is
        # trained on the pre-built tf-idf corpus loaded from disk instead).
        data_train = [dictionary.doc2bow(M.isMecab(j)) for j in ARTICLE[n]]
        tfidf_corpus = gensim.corpora.MmCorpus("news_noun_" + n + ".mm")
        lda = models.LdaModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=30)
        lda.save("model_" + n + ".lda")


if __name__ == "__main__":
    print "cleate_lda_model.."
    # CD.cleate_dic()
    cleate_lda_model()
    lda = models.LdaModel.load("model_Sports.lda")
    lda2 = models.LdaModel.load("model_Computer.lda")
    for n in range(30):
        M.pp(lda.print_topics(n + 1))
    for n in range(30):
        M.pp(lda2.print_topics(n + 1))
Example No. 9
from sklearn.ensemble import RandomForestClassifier
from gensim import corpora, matutils   # needed for Dictionary.load_from_text and corpus2dense below
import simplejson as json
import Test_MeCab as M

text = "iPhone 従来の「カケホーダイプラン」は、国内音声通話が回数の制限なしで無料となるもので、\
月額料金は2年定期契約で2700円だ。それに対してカケホーダイライトプランは月額料金が1700円と1000円分安くなる。\
その条件として5分以内の通話であれば、何回かけても無料だが、5分を超えた場合は30秒あたり20円がかかることとなった。\
ただし「ファミリー割引」に加入していれば家族間通話が無料となるので、家族で申込みする場合は利用したい。\
■データ定額プランは5GBからパケットパックとの組み合わせは、1か月のパケット上限が5GBまでとなる「データMパック」から用意される。\
こちらは月額料金が5000円。インターネット接続サービスの「spモード」300円に加入すれば、利用料金は月額7000円になる。\
ここに携帯電話の購入代金がかかってくる。たとえば『iPhone 6s』16GBモデルをMNPで購入すると、実質負担金は月額432円なので、\
トータル7432円/月で新型iPhoneを使うことができる。「データ通信よりも通話がメインだ」というユーザーなら、\
「カケホーダイプラン」2700円/月に容量2GBの「データSパック」3500円/月を組み合わせる方法もある。\
こちらで『iPhone 6s』16GBモデルをMNPで購入すると、月額6932円となる。"
text2 ="この辺にぃ、おいしいラーメン屋の屋台があるみたいなんですが、行きませんかー?カケホーダイ,iPhone"
ret = M.isMecab(text)
ret2 = M.isMecab(text2)

dictionary = corpora.Dictionary.load_from_text('test_dic.txt')
vec = dictionary.doc2bow(ret)
print(vec)

tmp = dictionary.doc2bow(ret)
dense = list(matutils.corpus2dense([tmp], num_terms=len(dictionary)).T[0])
print(dense)

vec = dictionary.doc2bow(ret2)
print(vec)

tmp = dictionary.doc2bow(ret2)
dense2 = list(matutils.corpus2dense([tmp], num_terms=len(dictionary)).T[0])
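# The snippet imports RandomForestClassifier but stops after building the dense
# vectors. A purely illustrative sketch, not in the original, of feeding them to
# the classifier; the two-sample training set and its labels are hypothetical.
labels = ["mobile_plans", "casual_chat"]      # made-up labels for text and text2
clf = RandomForestClassifier(n_estimators=10)
clf.fit([dense, dense2], labels)              # each dense BoW vector is one sample
print(clf.predict([dense]))                   # ['mobile_plans'] on its own training sample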