# Reference:
# https://qiita.com/ground0state/items/155b77f4c07e1a509a14
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from sklearn.metrics import accuracy_score

from q51_2 import load_data

# Load the logistic-regression model serialized by the training script.
# A context manager closes the file handle deterministically (the original
# pickle.load(open(...)) never closed it).
with open("my_lr.model", 'rb') as f:
    lr = pickle.load(f)

# Feature matrices and the raw tables (headline text / category labels)
# for the train and test splits, as produced by q51_2.load_data.
train_feature = load_data("train.feature.txt")
test_feature = load_data("test.feature.txt")
train = load_data("train.txt")
test = load_data("test.txt")

# Predicted category labels for the training and test feature sets.
pred_train = lr.predict(train_feature)
pred_test = lr.predict(test_feature)
# coding: utf-8

# References:
# https://zenn.dev/yagiyuki/articles/0d6f97028fdd40209b7f
# https://qiita.com/FujiedaTaro/items/5784eda386146f1fd6e7
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pickle
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

from q51_2 import load_data

# Load feature matrices and category labels for each split.
X_train = load_data("train.feature.txt")
X_valid = load_data("valid.feature.txt")
X_test = load_data("test.feature.txt")
Y_train = load_data("train.txt")["CATEGORY"]
Y_valid = load_data("valid.txt")["CATEGORY"]
Y_test = load_data("test.txt")["CATEGORY"]

# Candidate values for the regularization strength C: 10^-3 .. 10^3.
hyper_param = np.logspace(-3, 3, num=7)
best_param = 0
best_accuracy = 0

for c in hyper_param:
    # Train a model with this regularization strength.
    lr = LogisticRegression(max_iter=1000, C=c)
    lr.fit(X_train, Y_train)
    pred_valid = lr.predict(X_valid)
    # BUGFIX: the original loop never evaluated pred_valid, so
    # best_param / best_accuracy stayed at 0 and accuracy_score was
    # imported but unused.  Track the C with the best validation
    # accuracy, which is what these variables exist for.
    accuracy = accuracy_score(Y_valid, pred_valid)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_param = c
#!/usr/bin/env python
# coding: utf-8

# Reference:
# https://qiita.com/ground0state/items/155b77f4c07e1a509a14
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from q51_2 import load_data
import pickle

# Load the serialized model; a context manager closes the file handle
# (the original pickle.load(open(...)) leaked it).
with open("my_lr.model", 'rb') as f:
    lr = pickle.load(f)

# Test-split feature matrix and raw table.
test_feature = load_data("test.feature.txt")
test = load_data("test.txt")

# Predicted class label for each test sample.
Y_pred = lr.predict(test_feature)
# Per-class membership probabilities: one row per sample, one column per class.
Y_pred_proba = lr.predict_proba(test_feature)

print(Y_pred)
print(Y_pred_proba)
#!/usr/bin/env python
# coding: utf-8

import pandas as pd
from sklearn.linear_model import LogisticRegression
from q51_2 import load_data
import pickle

# Training data: feature matrix plus the table that holds the CATEGORY labels.
train_feature = load_data("train.feature.txt")
train = load_data("train.txt")

# max_iter raised to 1000 because the solver did not converge with the default.
lr = LogisticRegression(max_iter=1000)
lr.fit(train_feature, train["CATEGORY"])

# Serialize the trained model.  A context manager guarantees the output
# file is flushed and closed (the original pickle.dump(lr, open(...))
# never closed the handle).
filename = "my_lr.model"
with open(filename, 'wb') as f:
    pickle.dump(lr, f)
import pickle
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from q51_2 import load_data

# Load the serialized model; a context manager closes the file handle
# (the original pickle.load(open(...)) leaked it).
with open("my_lr.model", 'rb') as f:
    lr = pickle.load(f)

# Feature table: its column names are the feature (token) names, which
# line up positionally with the model's coefficient vectors.
df = load_data("test.feature.txt")

# For every class, print the 10 features with the lowest (下位) and the
# highest (上位) learned weights.
for cl, coef in zip(lr.classes_, lr.coef_):
    # Indices ordered from the most negative to the most positive weight.
    sorted_index = coef.argsort()
    print(f"class:{cl}")
    for i in range(0, 10):
        print(f"下位{i+1}:" + df.columns[sorted_index[i]] + f",{coef[sorted_index[i]]}")
    print()
    for i in range(0, 10):
        print(f"上位{i+1}:" + df.columns[sorted_index[-i-1]] + f",{coef[sorted_index[-i-1]]}")
    print()