def predict_titles(titles):
    """Print the predicted class and the top class probability for each title.

    The model and vectorizer are obtained from ``train()``. Two lines are
    printed: the array returned by ``predict`` and a list holding, for each
    title, the largest value in its ``predict_proba`` row.

    Args:
        titles: Collection of raw titles accepted by ``vectorizer.transform``.
    """
    lr_model, _, vectorizer = train()
    # Vectorize once; both predict() and predict_proba() consume the same
    # dense feature matrix.
    features = vectorizer.transform(titles).toarray()
    print(lr_model.predict(features))
    print([max(row) for row in lr_model.predict_proba(features)])
def knock57():
    """Print the ten largest and ten smallest feature weights of every class.

    For each entry of ``lr.classes_`` the class name is printed, followed by
    the ten highest weights in descending order, the ten lowest weights in
    ascending order (each as ``<term> : <weight>``), and an empty line.
    """
    lr, _, vectorizer = train()
    # vocabulary_ maps term -> column index; invert it to look terms up by column.
    index_to_term = {index: term for term, index in vectorizer.vocabulary_.items()}
    for row, class_name in enumerate(lr.classes_):
        weights = lr.coef_[row]
        print(class_name)
        # Rank column indices instead of searching for each weight value:
        # a value search returns the first match, so tied weights would all
        # be reported under the same term.
        descending = np.argsort(-weights, kind="stable")[:10]
        ascending = np.argsort(weights, kind="stable")[:10]
        for index in descending:
            print(index_to_term[index], ":", weights[index])
        for index in ascending:
            print(index_to_term[index], ":", weights[index])
        print()
def knock56():
    """Print precision, recall, F-score and support for the ``data[1]`` split.

    Three reports are printed: the per-class scores, then the macro-averaged
    and the micro-averaged scores, all computed from one set of predictions
    made by the model returned from ``train()``.
    """
    lr, data, _ = train()
    y_true = data[1][1]
    # Predict once; all three reports score the same predictions.
    y_pred = lr.predict(data[1][0])

    # Per-class result: the first row is precision, the second row is recall, ...
    # The first column is category 1, the second column is category 2, ...
    pprint(precision_recall_fscore_support(y_true=y_true, y_pred=y_pred))

    # Averaged results are in the order precision, recall, f-score, support.
    print(
        "macro:",
        precision_recall_fscore_support(
            y_true=y_true, y_pred=y_pred, average="macro"))
    print(
        "micro:",
        precision_recall_fscore_support(
            y_true=y_true, y_pred=y_pred, average="micro"))
def knock55():
    """Print a confusion matrix for ``data[0]`` and then one for ``data[1]``.

    Each split is indexed as ``[features, labels]``; the model returned by
    ``train()`` predicts from the features and the result is compared with
    the labels.
    """
    model, splits, _ = train()
    for position in (0, 1):
        expected = splits[position][1]
        predicted = model.predict(splits[position][0])
        print(confusion_matrix(y_true=expected, y_pred=predicted))
# Load the validation and test splits; each file is closed as soon as it has
# been parsed instead of staying open for the rest of the run.
with open('valid.feature.txt') as valid:
    valid_ftr, valid_label = read_data(valid)
with open('test.feature.txt') as test:
    test_ftr, test_label = read_data(test)

# Fit the vectorizer on the training features and persist it so accuracy()
# can be pointed at it by file name.
vectorizer = vectorize(train_ftr)
joblib.dump(vectorizer, 'vectorizer.pkl')
x_train = vectorizer.transform(train_ftr)
y_train = train_label

# One tab-separated record per exponent: label, train/valid/test accuracy.
regularization = []
for c in range(-5, 5):
    # train with regularization
    model_name = 'model_reg_10**' + str(c) + '.pkl'
    train(x_train, y_train, model_name, c)
    # NOTE(review): '10e' + str(c) parses as 10 * 10**c, while the model file
    # name says 10**c -- confirm which strength train() really uses.
    reg = '10e' + str(c)
    # calculate accuracy on train, valid, test
    acc_train = accuracy(train_ftr, train_label, model_name, 'vectorizer.pkl')
    acc_valid = accuracy(valid_ftr, valid_label, model_name, 'vectorizer.pkl')
    acc_test = accuracy(test_ftr, test_label, model_name, 'vectorizer.pkl')
    regularization.append(reg + '\t' + str(round(acc_train, 6)) + '\t'
                          + str(round(acc_valid, 6)) + '\t'
                          + str(round(acc_test, 6)))

# Split the records back into parallel numeric columns.
reg_val, acc_tr, acc_val, acc_ts = [], [], [], []
for i in regularization:
    temp = i.strip().split('\t')
    reg_val.append(float(temp[0]))
    acc_tr.append(float(temp[1]))
    acc_val.append(float(temp[2]))
    # The fourth column (test accuracy) was never collected although acc_ts
    # is declared alongside the other three columns.
    acc_ts.append(float(temp[3]))
def knock54():
    """Print the model's score on ``data[0]`` and then on ``data[1]``.

    Each split is indexed as ``[features, labels]`` and scored with
    ``lr.score``; the first line is labelled "train", the second "vaild".
    """
    model, splits, _ = train()
    for label, position in (("train", 0), ("vaild", 1)):
        subset = splits[position]
        print(label, model.score(subset[0], subset[1]))