def classify(model, train_data, test_data, model_name):
    """Fit *model* on train_data, predict on test_data and print metrics.

    Each row is a feature vector with its class label in the last
    position.  Prints a classification report, then MAE/MSE computed on a
    binary spam(1)/not-spam(0) encoding of the labels.

    NOTE: was written with Python 2 print statements; converted to
    print() calls for consistency with the rest of the file.
    """
    print("Testing with %s" % model_name)
    X = [row[:-1] for row in train_data]
    y = [row[-1] for row in train_data]
    model.fit(X, y)
    X_test = [row[:-1] for row in test_data]
    y_real = [row[-1] for row in test_data]
    y_pred = model.predict(X_test)
    print(report(y_real, y_pred))
    # Encode labels as 1 for 'spam', 0 otherwise so the regression-style
    # error metrics below are well defined.
    tp = lambda x: 1 if x == 'spam' else 0
    real = [tp(v) for v in y_real]
    pred = [tp(v) for v in y_pred]
    print(mean_absolute_error(real, pred))
    print(mean_squared_error(real, pred))
def test(config, model, datapath, usegpu):
    """Run *model* over every batch produced for *datapath* and print a
    classification report of gold vs. predicted tags.

    usegpu -- when True, input tensors are moved to CUDA.
    """
    source = feature.transformer(config, datapath)
    gold, predicted = [], []
    # iter() with a sentinel stops as soon as get_data() returns None.
    for batch in iter(source.get_data, None):
        inputs, batch_labels, seqlen = batch
        tensor = torch.Tensor(inputs)
        if usegpu:
            tensor = tensor.cuda()
        batch_input = Variable(tensor)
        model.init_hidden(usegpu)
        scores = model.forward(batch_input)
        _, best = torch.max(scores, 2)
        for idx, labels in enumerate(batch_labels):
            gold += labels
            predicted += best[idx].data.tolist()
    print(report(gold, predicted))
def _fit(self, seed, test_size, X, y):
    """Train a clone of self.model on a random split of (X, y).

    seed -- random_state for the split, so runs are reproducible.
    test_size -- fraction of samples held out for evaluation.
    X, y -- feature matrix and labels.

    Returns [fitted_model, report_dict].
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=seed
    )
    model_instance = clone(self.model)
    model_instance.fit(X_train, y_train)
    y_pred = model_instance.predict(X_test)
    self.y = y  # keep the full label vector around for the caller
    # BUG FIX: classification_report expects (y_true, y_pred); the
    # arguments were previously passed as (y_pred, y_true), which swaps
    # per-class precision and recall in the returned dict.
    return [
        model_instance,
        report(y_test, y_pred, output_dict=True)
    ]
def test_more(self, X_test, y_test, objs):
    """Interactively split the test set into n parts and report each model.

    Prompts for the number of parts and their ratios (e.g. ``1:3:6``),
    carves X_test/y_test into consecutive slices of those proportions and
    prints a classification report for every (model, name) pair in *objs*.

    NOTE(review): the size formula divides by ``10 - sum_r``, i.e. it
    assumes the entered ratios sum to 10 — confirm with callers.
    """
    n = int(input(
        '-- how many parts do you want to split test data into?: - '))
    r = input('-- ratios of them (eg: 1:3:6 if n=3): - ').split(':')
    r = [int(x) for x in r]
    sum_r = 0
    for i in range(n):
        size = round(r[i] / (10 - sum_r) * len(X_test))
        # BUG FIX: removed stray trailing comma in the assignment target.
        X_test_small, y_test_small = X_test[:size], y_test[:size]
        X_test, y_test = X_test[size:], y_test[size:]
        sum_r += r[i]
        for obj in objs:
            y_pred = obj[0].predict(X_test_small)
            print('--', obj[1])
            print(report(y_test_small, y_pred))

# BUG FIX: the line below was an unterminated triple-quoted string, which
# is a syntax error and would swallow the following code; kept as a comment.
# if you just want to test with an arbitrary amount of test data:
def evaluate(model, features, labels, mask):
    """Evaluate *model* on the masked subset of nodes.

    Returns (accuracy, macro precision, macro recall, macro F-score,
    per-class classification report string).
    """
    model.eval()
    with torch.no_grad():
        masked_logits = model(features)[mask]
        true_np = labels[mask].cpu().numpy()
        # argmax over the class dimension == torch.max(..., dim=1)[1]
        pred_np = masked_logits.argmax(dim=1).long().cpu().numpy()
        acc = (pred_np == true_np).sum() / len(pred_np)
        prec, rec, f1, _ = score(true_np, pred_np, average="macro")
        return acc, prec, rec, f1, report(true_np, pred_np)
def evaluate(model, features, labels, mask):
    """Score *model* on the masked nodes.

    Returns (accuracy, per-class precision/recall/F-score/support from
    sklearn's score(), classification report string).
    """
    model.eval()
    with torch.no_grad():
        out = model(features)[mask]
        gold = labels[mask]
        pred = out.argmax(dim=1)
        # Per-class statistics (no averaging), then overall accuracy.
        prec, rec, f1, support = score(gold, pred)
        n_correct = (pred == gold).sum().item()
        acc = n_correct * 1.0 / len(gold)
        return acc, prec, rec, f1, support, report(gold, pred)
def _fit(self, results, seed, X_train, X_test, y_train, y_test):
    """Fit self.model on the train split, evaluate on the test split and
    append an annotated report dict to *results* (also returned).

    Raises Exception when self.model_type is neither 'classification'
    nor 'regression'.
    """
    self.model.fit(X_train, y_train)
    predictions = self.model.predict(X_test)
    if self.model_type == "classification":
        summary = report(y_test, predictions, output_dict=True)
    elif self.model_type == "regression":
        summary = self.report(y_test, predictions)
    else:
        raise Exception("model_type must be regression or classification")
    summary["mask"] = self._get_mask(y_train, self.data.shape[0])
    summary["seed"] = seed
    summary["hyperparameters"] = self.hyperparameters
    # Record linear-model coefficients when the estimator exposes them;
    # for pipelines the estimator lives in the 'model' step.
    if self.is_pipeline():
        estimator = self.model.named_steps['model']
    else:
        estimator = self.model
    if 'coef_' in dir(estimator):
        summary['coef'] = estimator.coef_
    results.append(summary)
    return results
def evaluate(file1, file2):
    """Compare gold (file1) and predicted (file2) tag files line by line.

    Each non-empty line is expected to be "word tag"; malformed lines are
    skipped (but still counted in *num*, preserving the original
    behaviour).  Prints a classification report when sklearn is
    available, then the token accuracy.
    """
    num = 0
    wrong = 0
    real = []
    predicted = []
    tags = set()
    with codecs.open(file1, "r", encoding="iso8859-15") as f1, \
            open(file2, "r", encoding="iso8859-15") as f2:
        for line1, line2 in zip(f1, f2):
            if len(line1) > 1 and len(line2) > 1:
                num += 1
                try:
                    word_1, tag_1 = line1.split()
                    word_2, tag_2 = line2.split()
                except ValueError:
                    # Line did not split into exactly (word, tag): skip it
                    # but do not hide unrelated errors (was a bare except).
                    continue
                real.append(tag_1)
                tags.add(tag_1)
                predicted.append(tag_2)
                if tag_1 != tag_2:
                    wrong += 1
    try:
        from sklearn.metrics import classification_report as report
        print("REPORT", report(real, predicted, labels=list(tags)))
    except ImportError:
        print("sklearn module not found. skipping classification report")
    # Guard against empty input (was a ZeroDivisionError).
    if num:
        print("accuracy", (num - wrong) / float(num), "%")
    else:
        print("accuracy undefined: no comparable lines")
#evaluate("data/tiger_test.txt","results.txt")
def f1_scores(results, truth):
    """Print a classification report of predicted vs. true 'class' columns."""
    y_true = truth['class'].tolist()
    y_pred = results['class'].tolist()
    print(report(y_true, y_pred))
def run(x, y):
    """Print accuracy, classification report and confusion matrix for the
    label pair (x, y); always returns 0."""
    for metric in (accuracy, report, confusion_matrix):
        print(metric(x, y))
    return 0
# NOTE(review): this chunk begins mid-expression — the opening of the first
# Pipeline (its name and earlier steps) lies outside this view; the 'dense'
# step suggests it feeds GaussianNB, which requires dense input.
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('dense', DenseTransformer()),
    ('clf', GaussianNB())
])
# Bernoulli naive Bayes pipeline over the same text features.
bernoulli = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', BernoulliNB())
])
kfold = KFold(n_splits=10)
print("\n\n############ MULTINOMIAL STARTED ########################")
# 10-fold CV: refit on each train split, report on the held-out split.
multinomial_report = [report(classified_target[test],
                             multinomial.fit(classified_data[train],
                                             classified_target[train])
                             .predict(classified_data[test]))
                      for train, test in kfold.split(classified_data)]
for rep in multinomial_report:
    print(rep)
print("\n\n############ GAUSSIAN STARTED ########################")
gaussian_report = [report(classified_target[test],
                          gaussian.fit(classified_data[train],
                                       classified_target[train])
                          .predict(classified_data[test]))
                   for train, test in kfold.split(classified_data)]
for rep in gaussian_report:
    print(rep)
print("\n\n############ BERNOULLI STARTED ########################")
bernoulli_report = [report(classified_target[test],
                           bernoulli.fit(classified_data[train],
                                         classified_target[train])
                           .predict(classified_data[test]))
                    for train, test in kfold.split(classified_data)]
# NOTE(review): chunk truncated — the body of this loop lies outside
# this view.
for rep in bernoulli_report:
# feature selection #df1=SelectPercentile(chi2,percentile=99).fit_transform(df1,label_train) #df2=SelectPercentile(chi2,percentile=99).fit_transform(df2,label_eval) # leveraging of SVM model_svm=svm.SVC(C=1, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovo', degree=1, gamma='scale', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False) model_svm.fit(df1,label_train) result_svm=model_svm.predict(df2) report1 = report(label_eval,result_svm,digits=5) print(report1) model_svm=svm.SVC(C=1, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovo', degree=2, gamma='scale', kernel='linear', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False) model_svm.fit(df1,label_train) result_svm=model_svm.predict(df2) report11 = report(label_eval,result_svm,digits=5) print(report11) model_svm=svm.SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
# Train the network for n_updates steps.
for i in range(n_updates):
    model.train(gradient_dataset, whole_train_dataset, i)

### EVALUATION OF NETWORK ###
pred_list = []
labels = []
target_names = ['Anger', 'Boredom', 'Disgust', 'Fear', 'Happiness',
                'Sadness', 'Neutral']
classes = range(7)
# --- training-set evaluation ---
for idx, i in enumerate(trX):
    # Do the prediction for each frame.
    prediction = list(model.predict(i))
    # Calculate predominant (majority-vote) class across the frames.
    pred_list.append(
        argmax_index([prediction.count(j) for j in range(n_classes)]))
    labels.append(trY[idx][0])
# BUG FIX: the reports/accuracies below were Python 2 print statements,
# inconsistent with the Python 3 print() calls used elsewhere in the file
# (these forms are valid under both interpreters).
print(report(labels, pred_list, target_names=target_names))
print("Accuracy : " + str(np.mean(np.asarray(labels) == np.asarray(pred_list))))
# --- test-set evaluation, same procedure ---
pred_list = []
labels = []
for idx, i in enumerate(teX):
    prediction = list(model.predict(i))
    pred_list.append(
        argmax_index([prediction.count(j) for j in range(n_classes)]))
    labels.append(teY[idx][0])
print(report(labels, pred_list, target_names=target_names))
print("Accuracy : " + str(np.mean(np.asarray(labels) == np.asarray(pred_list))))
# In[2]: ###Initial Baseline Models x_train, y_train = load_data("../data/Simulated_Data_Train.csv") x_val, y_val = load_data("../data/Simulated_Data_Validation.csv") x_test, y_test = load_data("../data/Simulated_Data_Test.csv") # In[4]: nn = feed_forward(x_train, y_train, width=32) nn.train(20) print("****** Initial Feed Forward Network *********") print(report(y_test, nn.predict(x_test))) # In[36]: def tune_model_width(build_fn, x_train, y_train, x_val, y_val, max_width=50): """ Takes a 3-Layer nueral network and expands width to see if there are tangible benefits to increasing the width of the hidden layer in the model. Parameters: build_fn - function that returns a keras nn model with the specified parameters x_train - the data matrix y_train - the response function x_val - validation data
def evaluateSpacy(conll_test, max_sent=None, print_dicts=False):
    """Evaluate spaCy's pretrained NER ('en_core_web_sm') on a CoNLL file.

    Prints the token-level accuracy from sklearn's classification report
    and a chunk/entity-level table computed by the `conll` helper.

    conll_test -- path to the CoNLL-format test file (read by loadConll).
    max_sent -- optional int; when given, only the first max_sent
                sentences are processed.
    print_dicts -- unused in this function (presumably a debug switch —
                   confirm with callers).
    """
    nlp = spacy.load('en_core_web_sm')
    test = loadConll(conll_test)
    if max_sent is not None and isinstance(max_sent, int):
        test_doc = list(nlp.pipe(test['text'][:max_sent]))
    else:
        test_doc = list(nlp.pipe(test['text']))
    # print('Elements in doc format: {}'.format(len(test_doc)))
    # Retokenization to merge '-' elements (ex: dates, obj-obj) so spaCy's
    # tokens line up one-to-one with the CoNLL tokens.
    for doc in test_doc:
        with doc.retokenize() as retokenizer:
            index = 0
            startMerging = -1
            for token in doc:
                # A token glued to the next one (no trailing whitespace)
                # opens a merge span ...
                if token.whitespace_ == '' and startMerging == -1:
                    startMerging = index
                # ... which closes at the next whitespace-terminated token
                # or at the end of the doc.
                if (token.whitespace_ == ' ' or index == len(doc)-1) \
                        and startMerging != -1:
                    retokenizer.merge(doc[startMerging:index + 1])
                    startMerging = -1
                index += 1
    # Flatten spaCy predictions into IOB(-TYPE) labels, mapped through
    # converter() onto the CoNLL tag set.
    doc_spacy_test_list = []
    for doc in test_doc:
        for token in doc:
            if token.ent_type_ == '':
                key = token.ent_iob_
            else:
                key = token.ent_iob_ + '-' + token.ent_type_
            doc_spacy_test_list.append(converter(key))
    # Flatten the gold CoNLL tags the same way.
    doc_conll_test_list = []
    for tag_list in test['NE_tag']:
        for tag in tag_list.split():
            doc_conll_test_list.append(tag)
    scores = report(doc_conll_test_list, doc_spacy_test_list,
                    output_dict=True, zero_division=0)
    print('Accuracy on spacy prediction: {:0.4f}\n'.format(
        scores['accuracy']))
    # Chunk accuracy (i.e entity accuracy): build per-sentence lists of
    # [token, tag] pairs for both reference and hypothesis.
    sent_idx = 0
    ref_list = []
    hyp_list = []
    for sent in test['text'][:max_sent]:
        token_idx = 0
        ref_token_list = []
        hyp_token_list = []
        for token in sent.split():
            ref_token_list.append(
                [token, test['NE_tag'][sent_idx].split()[token_idx]])
            # NOTE(review): indexes test_doc[sent_idx][token_idx] directly —
            # relies on the retokenization above producing exactly one spaCy
            # token per whitespace-split CoNLL token; confirm.
            if test_doc[sent_idx][token_idx].ent_type_ == '':
                hyp_token_list.append([
                    test_doc[sent_idx][token_idx].text,
                    test_doc[sent_idx][token_idx].ent_iob_
                ])
            else:
                hyp_token_list.append([
                    test_doc[sent_idx][token_idx].text,
                    test_doc[sent_idx][token_idx].ent_iob_ + '-' +
                    converter(test_doc[sent_idx][token_idx].ent_type_)
                ])
            token_idx += 1
        ref_list.append(ref_token_list)
        hyp_list.append(hyp_token_list)
        sent_idx += 1
    measures = conll.evaluate(ref_list, hyp_list)
    # Make fancy table:
    measureShow = pd.DataFrame().from_dict(measures, orient='index')
    print(measureShow.round(decimals=3))
directory = "F:\\To_server\\model\\test_results\\" #filename = "tagging.test.hyp.txt" #filename = "test.txt" filename = sys.argv[1] with open(directory + filename) as f: prediction = []; label = []; unique_label = {} label_names = [] for line in f: contentlist = line.split() #if (len(contentlist) != 0 and len(contentlist) !=3): #print(line) #print(len(contentlist)) if (len(contentlist) == 0 or contentlist[0] == "BOS" or contentlist[0] == "EOS"): continue else: label.append(contentlist[1]) prediction.append(contentlist[2]) if (contentlist[1] in unique_label): unique_label[contentlist[1]] += 1 else: label_names.append(contentlist[1]) unique_label[contentlist[1]] = 1 print(report(label, prediction, label_names, target_names=label_names))
""" import matplotlib.pyplot as plt from sklearn import datasets, svm, metrics from sklearn.metrics import classification_report as report # format1 = "Classification report for classifier %s:\n%s\n" format2 = "Confusion matrix:\n%s" digits = datasets.load_digits() imageLabels = list(zip(digits.images, digits.target)) for index, (image, label) in enumerate(imageLabels[:4]): plt.subplot(2, 4, index + 1) plt.axis('off') plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest') plt.title('Training: %i' % label) n = len(digits.images) data2 = digits.images.reshape((n, -1)) classifier = svm.SVC(gamma=0.001) classifier.fit(data2[:n // 2], digits.target[:n // 2]) expected = digits.target[n // 2:] predicted = classifier.predict(data[n // 2:]) print(format1 % (classifier, report(expected, predicted))) print(format2 % metrics.confusion_matrix(expected, predicted)) imageAndPredictions = list(zip(digits.images[n // 2:], predicted)) for index, (image, prediction) in enumerate(imageAndPredictions[:4]): plt.subplot(2, 4, index + 5) plt.axis('off') plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest') plt.title('Prediction: %i' % prediction) plt.show()
import numpy as np
from sklearn.metrics import classification_report as report

# Gold labels, predicted labels and per-sample payoff values.
a = np.array([1, 0, 0, 2, 2, 1, 0, 0, 0, 0])
b = np.array([1, 1, 2, 2, 0, 0, 1, 1, 0, 0])
res = np.array([100, 100, -50, -50, 20, -5, 100, 100, -30, -50])

# Samples whose gold class is 1 and prediction is 1 or 0 count as
# "positive"; gold class 2 predicted as 2 or 0 count as "negative".
positive = (a == 1) & ((b == 1) | (b == 0))
negative = (a == 2) & ((b == 2) | (b == 0))

print(report(a, b))
# Net payoff: gains on the positive set minus losses on the negative set.
print(res[positive].sum() - res[negative].sum())
def training(self):
    """Train a LightGBM binary classifier for keyword extraction.

    Loads balanced train/test CSVs from hard-coded paths, fits a GBDT on
    a hand-picked feature subset, prints an F1 figure on the validation
    split and a classification report on the training split, then draws
    a SHAP summary plot of feature importances.
    """
    # NOTE(review): machine-specific absolute paths — confirm before reuse.
    data_train = pd.read_csv(
        "D:/Python_Project/Keywords_extraction/train_balance.csv")
    data_test = pd.read_csv(
        "D:/Python_Project/Keywords_extraction/test_balance.csv")
    acc = 0  # never updated below
    # Alternative feature selections from earlier experiments:
    # cols = [col for col in data_train.columns if col not in ['id', '关键词', '标签']]
    # cols = [col for col in data_train.columns if col in ['头词频','词频','词长','IDF','出现在标题','首次出现词位置','最后出现词位置','词方差','词平均','词偏度','词峰度','词差方差','最大词差','最小词差','最小句中位置','首次句位置','最后句位置','出现在第一句','出现在最后一句','句子出现频率','句平均','句偏度','包含英文','度中心性','s','f','v','d','k','x','i','l','un','包含数字']]
    '''
    cols=['词频','词长','IDF','出现在标题','首次出现词位置','最后出现词位置','词方差','词偏度','最大句中位置','最小句中位置',
    '平均句中位置','平均句长','首次句位置','出现在最后一句','句子出现频率','句方差',
    '句平均','句差方差','最大句差','包含英文','接近中心性','n', 't', 'v', 'z', 'q', 'd', 'k', 'x', 'y', '包含数字']
    ['词频', '词长', 'IDF', '出现在标题', '首次出现词位置', '词方差', '词平均', '最大词差', '最大句中位置', '平均句中位置',
    '首次句位置', '出现在第一句', '出现在最后一句', '句子出现频率', '句方差', '句差方差', '最大句差', '度中心性',
    'n', 'v', 'a', 'z', 'd', 'h', 'k', 'x', 'g', 'j', 'y', 'un', '包含数字']
    '''
    # Selected feature columns (word/sentence statistics plus POS-tag
    # indicator columns; names are Chinese column headers in the CSVs).
    cols = [
        '词频', '词长', 'IDF', '出现在标题', '首次出现词位置', '词方差', '词平均',
        '最大词差', '最大句中位置', '平均句中位置', '首次句位置', '出现在第一句',
        '出现在最后一句', '句子出现频率', '句方差', '句差方差', '最大句差', '度中心性',
        'n', 'v', 'a', 'z', 'd', 'h', 'k', 'x', 'g', 'j', 'y', 'un', '包含数字'
    ]
    # cols = [col for col in data_train.columns if col not in ['id', '关键词', '标签']]
    x_train = data_train.loc[:, cols]
    y_train = data_train.loc[:, '标签']  # '标签' is the binary label column
    x_train = x_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    x_val = data_test.loc[:, cols]
    y_val = data_test.loc[:, '标签']
    x_val = x_val.reset_index(drop=True)
    y_val = y_val.reset_index(drop=True)
    # (Earlier experiment: 30% test / 70% train split.)
    # x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=0)
    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_eval = lgb.Dataset(x_val, y_val, reference=lgb_train)
    # print('开始训练......')  (= "start training...")
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': {'auc'},
        'learning_rate': 0.025,
        'num_leaves': 100,
        'min_data_in_leaf': 70,
        'bagging_fraction': 0.85,
        'is_unbalance': 'true',
        'seed': 42
    }
    gbm = lgb.train(
        params,
        lgb_train,
        num_boost_round=5000,
        valid_sets=lgb_eval,
        early_stopping_rounds=30,
        verbose_eval=False,
    )
    y_pred = gbm.predict(x_val)
    y_pred = list(y_pred)
    Y_val = list(y_val)
    # Manual F1 on the validation split at a 0.5 threshold.
    pos = 0       # gold positives
    pos_acc = 0   # true positives
    pos_pre = 0   # predicted positives
    for i, j in zip(Y_val, y_pred):
        if (i >= 0.5):
            pos += 1
        if (i >= 0.5 and j >= 0.5):
            pos_acc += 1
        if (j >= 0.5):
            pos_pre += 1
    pos_r = pos_acc / pos      # recall
    pos_a = pos_acc / pos_pre  # precision
    print((pos_a * pos_r) / (pos_a + pos_r) * 2)  # F1
    # Binarise validation predictions in place at 0.5.
    i = 0
    count = 0  # unused
    for item in y_pred:
        if item > 0.5:
            y_pred[i] = 1
        else:
            y_pred[i] = 0
        i = i + 1
    # print(report(Y_val, y_pred,digits=4))
    # Same binarisation and report on the TRAINING split (training fit).
    y_pred = gbm.predict(x_train)
    y_pred = list(y_pred)
    Y_train = list(y_train)
    i = 0
    count = 0
    for item in y_pred:
        if item > 0.5:
            y_pred[i] = 1
        else:
            y_pred[i] = 0
        i = i + 1
    print(report(Y_train, y_pred, digits=4))
    plt.rc('font', family='SimSun', size=13)
    # gbm.save_model('lgbmodel_allfeature.model')
    # SHAP feature-importance analysis over the training features.
    explainer = shap.TreeExplainer(gbm)
    shap_values = explainer.shap_values(x_train)
    # The baseline y_base is the mean of the model's fitted values over
    # the training targets.
    y_base = explainer.expected_value
    shap.initjs()
    # shap.summary_plot(shap_values[0], x_train, sort=True, color_bar_label=("FEATURE_VALUE0"))#1
    shap.summary_plot(shap_values[1], x_train, sort=True,
                      color_bar_label=("FEATURE_VALUE1"))  # 2
def train(configpath, datapath, classes, usegpu):
    """Train the LSTM tagger defined in `model` on data from *datapath*.

    configpath -- path to an INI config file (read with configparser;
                  uses the 'train' section: learning_rate, epoch).
    datapath -- path handed to feature.transformer, which yields batches.
    classes -- forwarded to model.LSTM (presumably the number of output
               classes — confirm against model.LSTM).
    usegpu -- bool; move net/tensors to CUDA when True.
    """
    config = configparser.ConfigParser()
    config.read(configpath)
    getdata = feature.transformer(config, datapath)
    print('building net...')
    net = model.LSTM(config, classes)
    if usegpu:
        net = net.cuda()
    optimer = optim.Adam(net.parameters(),
                         lr=config.getfloat('train', 'learning_rate'))
    criterion = nn.CrossEntropyLoss()
    print('begin training ...')
    for epoch in range(config.getint('train', 'epoch')):
        print('epoch:', epoch)
        trueTag = []  # gold tags accumulated over this epoch
        predTag = []  # predicted tags accumulated over this epoch
        # NOTE(review): get_data() is drained until None each epoch —
        # confirm the transformer rewinds between epochs.
        while True:
            data = getdata.get_data()
            if data is None:
                break
            traindata, label0, seqlen = data
            if usegpu:
                traindata = Variable(torch.Tensor(traindata).cuda())
                label = Variable(torch.LongTensor(label0).cuda())
            else:
                traindata = Variable(torch.Tensor(traindata))
                label = Variable(torch.LongTensor(label0))
            net.init_hidden(usegpu)
            output = net.forward(traindata)
            # print(output.size())
            loss = 0
            _, pred = torch.max(output, 2)
            # Collect gold/predicted tags for the epoch-level report.
            for i, l in enumerate(label0):
                trueTag += l
                predTag += pred[i].data.tolist()
            # print(output.size())
            # Per-token cross-entropy; tokens labelled 0 (presumably the
            # 'other'/background tag — confirm) are down-weighted by 0.05.
            for i, seq in enumerate(output):
                # print(seq.size())
                # print(label[i])
                # print(seq.size())
                for j, l in enumerate(label[i]):
                    # print(seq[j])
                    if l.data.tolist()[0] == 0:
                        loss += 0.05 * criterion(seq[j].view(1, -1), l)
                    else:
                        loss += criterion(seq[j].view(1, -1), l)
            # print(loss)
            optimer.zero_grad()
            loss.backward()
            optimer.step()
        print('train result')
        # print(trueTag)
        # print(predTag)
        print(report(trueTag, predTag))
        print('test result')
        # NOTE(review): hard-coded, machine-specific test-set path.
        test(config, net,
             '/Users/Smart/Desktop/code/Challenge_Cup/test1.json', usegpu)
def main(trainpaths, testpath):
    """Train a KNN chord-progression classifier and report on test data.

    trainpaths -- paths to the training corpora (one per progression),
                  consumed by collect_train_data.
    testpath -- path to the test corpus, consumed by collect_test_data.
    """
    #Load training data in
    train_dict = collect_train_data(trainpaths)
    print('Training data loaded successfully')
    #Load test data in
    test_data = collect_test_data(testpath)
    print('Test data loaded successfully')
    #Collect labels and build label <-> index mappings.
    labels = collect_labels(train_dict)
    label_dict = create_label_dict(train_dict)
    inv_label_dict = {value: key for key, value in label_dict.items()}
    #Train Vectorizer: one pitch-statistics feature dict per melody.
    v = DictVectorizer(sparse=False)
    train_features_list = []
    for chord_prog in train_dict.keys():
        for note_list in train_dict[chord_prog]:
            n = 4  # window length for the n-gram pitch statistics
            D = {
                'max_avg_ngrams': avg_max_pitch(note_list, n),
                'min_avg_ngrams': avg_min_pitch(note_list, n),
                'num_notes': len(note_list),
                'num_max_pitch': num_max_pitch(note_list, n),
                'num_min_pitch': num_min_pitch(note_list, n),
                'avg_pitch': avg_pitch(note_list),
                'max_pitch_diff': max_pitch_diff(note_list),
                'max_diff_avg_ngrams': max_pitch_diff_avg(note_list, n),
                'most_freq_pitch': most_common_pitch(note_list),
                'freq_pitch_diff_avg': normalized_pitch_diff_avg(note_list)
            }
            train_features_list.append(D)
    x_train = v.fit_transform(train_features_list)
    print('Train Features Step Complete')
    #Vectorize Test data for later use (same feature template as training).
    test_features_list = []
    for note_list in test_data:
        n = 4
        D = {
            'max_avg_ngrams': avg_max_pitch(note_list, n),
            'min_avg_ngrams': avg_min_pitch(note_list, n),
            'num_notes': len(note_list),
            'num_max_pitch': num_max_pitch(note_list, n),
            'num_min_pitch': num_min_pitch(note_list, n),
            'avg_pitch': avg_pitch(note_list),
            'max_pitch_diff': max_pitch_diff(note_list),
            'max_diff_avg_ngrams': max_pitch_diff_avg(note_list, n),
            'most_freq_pitch': most_common_pitch(note_list),
            'freq_pitch_diff_avg': normalized_pitch_diff_avg(note_list)
        }
        test_features_list.append(D)
    x_test = v.transform(test_features_list)
    print('Test Features Step Complete')
    #Train Classifer
    K = knn(n_neighbors=5)
    y_train = [label_dict[label] for label in labels]
    K = K.fit(x_train, y_train)
    print('KNN Classifier Training Step Complete')
    #For report later on
    y_pred = []
    #Predict chord progression using KNN, one sample at a time, printing
    #the human-readable label for each.
    for x in x_test:
        x_predict = []
        x_predict.append(x)
        predict = K.predict(x_predict)
        print(inv_label_dict[predict[0]])
        y_pred.append(predict[0])
    #Find Precision, Recall, F-1 scores for test data over chord progressions
    target_names = [prog for prog in label_dict.keys()]
    #Hardcoded for test data -- NOTE(review): assumes the test corpus holds
    #exactly these six pieces in this label order; confirm.
    y_true = [0, 0, 1, 2, 3, 4]
    #Print Report
    print(report(y_true, y_pred, target_names=target_names))
# NOTE(review): chunk begins mid-expression — the opening of the concat
# call building the CountVectorizer feature frame lies outside this view.
], axis=1)

# Build model: random forest on the CountVectorizer features.
rf_cv = RandomForestClassifier(n_estimators=300, max_depth=90, n_jobs=-1)
rf_model_cv = rf_cv.fit(X_cv_train, y_train)
y_prediction_cv = rf_model_cv.predict(X_cv_test)
precision, recall, fscore, train_support = f_score(
    y_test, y_prediction_cv, pos_label='spam', average='micro')
print('Precision: {} --- Recall: {} --- F1-Score: {} --- Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round(fscore, 3),
    round(accuracy(y_test, y_prediction_cv), 3)))
print(report(y_test, y_prediction_cv))
# ------------------------------------------------------------------------------------
# Making the Confusion Matrix: CountVectorizer
matrixcv = confusion_matrix(y_test, y_prediction_cv)
class_label = ['0', '1', '2', '3', '4']
matrixcv_df = pd.DataFrame(matrixcv, index=class_label, columns=class_label)
sns.heatmap(matrixcv_df, annot=True, fmt='d')
plt.title("Confusion Matrix of best CountVectorizer model")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()
# ------------------------------------------------------------------------------------
# Evaluation of model: on tfidfVectorizer
# num_estimator = 300, max_depth = None
# Variables define
tfidf = tfidfV(ngram_range=(2, 2), analyzer=cleandata)  # defined before
# Load gold labels for the train and eval tweet sets.
label_train = getLabel([], 'train-labels.txt')
label_eval = getLabel([], 'eval-labels.txt')
x_vectorizer = HashingVectorizer()
corpus1 = []
corpus2 = []
# Each line is "<id>\t<tweet text>"; keep only the text column.
with open('train-tweets.txt', encoding='utf-8') as train:
    for line in train:
        line = line.replace("\n", "").split("\t")
        corpus1.append(line[1])
with open('eval-tweets.txt', encoding='utf-8') as train:
    for line in train:
        line = line.replace("\n", "").split("\t")
        corpus2.append(line[1])
# print(corpus)
X = x_vectorizer.fit_transform(corpus1)
#X.toarray()
# NOTE(review): a second, separately-fitted HashingVectorizer transforms
# the eval corpus; hashing is stateless so the feature spaces coincide,
# but reusing x_vectorizer.transform would be clearer — confirm intent.
y_vectorizer = HashingVectorizer()
Y = y_vectorizer.fit_transform(corpus2)
#print(x_vectorizer.get_feature_names())
# Multinomial naive Bayes on the hashed tweet features.
model_NB = MultinomialNB()
model_NB.fit(X, label_train)
result_NB = model_NB.predict(Y)
report3 = report(label_eval, result_NB, digits=5)
print('\n', report3)
# In[2]: x_train, y_train = load_data("../data/Simulated_Data_Train.csv") x_val, y_val = load_data("../data/Simulated_Data_Validation.csv") x_test, y_test = load_data("../data/Simulated_Data_Test.csv") # In[6]: lr = log_reg(x_train, y_train) yprob = lr.predict(x_test) yhat = decide(yprob, 0.5) print(report(y_test, lr.model.predict(x_test))) # In[19]: credit_data = load_data("../data/Simulated_Data_Test.csv", as_df = True) coef_dict = {} for var, coef in zip(credit_data.columns, lr.model.coef_[0]): coef_dict[var] = coef coef_frame = pd.DataFrame.from_dict(coef_dict, 'index', columns = ["coefficient"]) coef_frame.to_latex("../report/coef.tex")