def NB_Accuracy(features_train, labels_train, features_test, labels_test):
    """Compute the accuracy of a Gaussian Naive Bayes classifier."""
    # Import GaussianNB and the metrics we need from sklearn
    from sklearn.naive_bayes import GaussianNB
    from sklearn.metrics import accuracy_score, confusion_matrix
    import matplotlib.pyplot as plt

    # Create the classifier
    clf = GaussianNB()

    # Train the classifier
    clf.fit(features_train, labels_train)

    # Use the trained classifier to predict the labels of the test set
    pred = clf.predict(features_test)
    print("Predictions:")
    print(pred)

    # Plot the confusion matrix on the test set
    y_true, y_pred = labels_test, pred
    cm = confusion_matrix(y_true, y_pred)
    class_names = [0, 1]
    plt.figure()
    plot_confusion_matrix(cm, classes=class_names, title='Confusion matrix')
    plt.show()

    # Compute and return the accuracy on the test set (the original returned
    # the raw count of correct predictions via normalize=False, which
    # contradicts the docstring)
    return accuracy_score(y_true, y_pred)
    # return plt
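# None of these snippets define plot_confusion_matrix. Most call sites pass raw
# labels, e.g. plot_confusion_matrix(y_test, y_pred, classes=..., normalize=...),
# while NB_Accuracy above passes a precomputed matrix. Below is a minimal sketch
# of the raw-label variant, modeled on the classic scikit-learn docs example;
# treat it as an assumption about the missing helper, not the original code.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(y_true, y_pred, classes, normalize=False,
                          title=None, cmap=plt.cm.Blues):
    """Plot an (optionally row-normalized) confusion matrix; return the axes."""
    cm = confusion_matrix(y_true, y_pred)
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    fig.colorbar(im, ax=ax)
    ax.set(xticks=np.arange(cm.shape[1]), yticks=np.arange(cm.shape[0]),
           xticklabels=classes, yticklabels=classes,
           xlabel='Predicted label', ylabel='True label',
           title=title or 'Confusion matrix')
    # Annotate each cell with its count (or rate, when normalized)
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.0
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt), ha='center', va='center',
                    color='white' if cm[i, j] > thresh else 'black')
    fig.tight_layout()
    return ax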
def plot(class_num, y_test, ans_best):
    # One string label per class, e.g. ['0', '1', '2'] when class_num == 3
    classes = [str(i) for i in range(class_num)]
    np.set_printoptions(precision=2)
    plot_confusion_matrix(y_test, ans_best, classes=classes, normalize=False,
                          title=None, cmap=plt.cm.Blues)
    plt.show()
# Tail of determine_depth_tree (its call below is commented out): score the
# fitted model on the held-out fold and average the per-depth results.
        predictions = model_rf.predict(X_test2)
        results_rf[i].iloc[j] = np.sum(predictions == y_test2)
    return results_rf.mean(axis=1)

#results_rf = determine_depth_tree(X_train, y_train)

# Results of the first model
model_rf = RandomForestClassifier(n_estimators=500, max_depth=1000,
                                  class_weight="balanced")
model_rf.fit(X_train, y_train)
y_pred = model_rf.predict(X_test)

# Plot non-normalized confusion matrix
plot_confusion_matrix(y_test, y_pred, classes=["Non-seizure", "Seizure"],
                      title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
plot_confusion_matrix(y_test, y_pred, classes=["Non-seizure", "Seizure"],
                      normalize=True, title='Normalized confusion matrix')
plt.show()
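# Only the tail of determine_depth_tree survives above; the loop headers are
# lost. A plausible reconstruction is sketched below, assuming results_rf has
# one column per split (i) and one row per candidate max_depth (j), so that
# mean(axis=1) averages each depth across splits. The depth grid, split count,
# and split strategy are all assumptions.
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

def determine_depth_tree(X_train, y_train, depths=(5, 10, 50, 100, 500),
                         n_splits=5):
    """Mean held-out hit count per candidate max_depth, averaged over splits."""
    results_rf = pd.DataFrame(index=range(len(depths)),
                              columns=range(n_splits), dtype=float)
    for i in range(n_splits):                      # column i = split
        X_train2, X_test2, y_train2, y_test2 = train_test_split(
            X_train, y_train, test_size=0.2, random_state=i)
        for j, depth in enumerate(depths):         # row j = depth
            model_rf = RandomForestClassifier(n_estimators=100, max_depth=depth,
                                              class_weight="balanced")
            model_rf.fit(X_train2, y_train2)
            predictions = model_rf.predict(X_test2)
            # (the fragment used results_rf[i].iloc[j]; .iloc avoids chained
            # assignment)
            results_rf.iloc[j, i] = np.sum(predictions == y_test2)
    return results_rf.mean(axis=1)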
pipeline.fit(train_X, train_y)

y_true = test_y
y_pred = pipeline.predict(test_X)

weighted_f1_score = f1_score(y_true, y_pred, average='weighted')
f1_scores = f1_score(y_true, y_pred, average=None, labels=unique_labels)
class_f1_scores = dict(zip(unique_labels, f1_scores))

evaluation = {
    'weighted_f1_score': weighted_f1_score,
    'class_f1_scores': class_f1_scores,
}

# Write the scores to a JSON file tagged with the current git revision, then
# compare against the best run so far
filename = 'evaluation-{}.json'.format(current_git_sha())
with open(filename, 'w') as f:
    json.dump(evaluation, f, sort_keys=True, indent=4, separators=(',', ': '))
compare('evaluation-best.json', filename)

if args.confusion_matrix:
    cm = confusion_matrix(y_true, y_pred, labels=unique_labels)
    plot_confusion_matrix(cm, unique_labels, normalize=True)
    plt.show()
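# current_git_sha() and compare() are not defined anywhere in these snippets. A
# minimal sketch of the git helper, assuming the script runs inside a git
# checkout, might look like this:
import subprocess

def current_git_sha():
    """Short SHA of HEAD, used to tag evaluation artifacts per revision."""
    return subprocess.check_output(
        ['git', 'rev-parse', '--short', 'HEAD']).decode('ascii').strip()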
def main(args):
    train_data_count = pd.read_csv('dataset/training/train.csv')
    val_data_count = pd.read_csv('dataset/training/valid.csv')
    test_data_count = pd.read_csv('dataset/training/test.csv')

    print('The count for each label in the training set is:')
    print(train_data_count['label'].value_counts())
    print()
    print('The count for each label in the validation set is:')
    print(val_data_count['label'].value_counts())
    print()
    print('The count for each label in the testing set is:')
    print(test_data_count['label'].value_counts())
    print()

    # Select the tokenizer (the original used two separate ifs, so 'crazy' was
    # always overridden by the spacy default; an elif chain fixes that)
    if args.tokenizer == 'crazy':
        print('The tokenizer is: CrazyTokenizer \n')
        tokenizer = CrazyTokenizer().tokenize
    elif args.tokenizer == 'nltk':
        print('The tokenizer is: NLTK \n')
        tokenizer = sent_tokenize
    else:
        print('The tokenizer is: spacy \n')
        tokenizer = 'spacy'

    print('The model used is:', args.model, '\n')

    # Note: the selected tokenizer is never passed to the Field (no
    # tokenize=tokenizer), so the Field's default tokenizer is used
    text = data.Field(sequential=True, lower=True, include_lengths=True)
    labels = data.Field(sequential=False, use_vocab=False)

    train_data, val_data, test_data = data.TabularDataset.splits(
        path='./dataset', train='./training/train.csv',
        validation='./training/valid.csv', test='./training/test.csv',
        format='csv', skip_header=True,
        fields=[('text', text), ('label', labels)])

    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train_data, val_data, test_data),
        batch_sizes=(args.batch_size, args.batch_size, args.batch_size),
        sort_key=lambda x: len(x.text), device=None,
        sort_within_batch=True, repeat=False)

    text.build_vocab(train_data, val_data, test_data)
    text.vocab.load_vectors(torchtext.vocab.GloVe(name='6B', dim=100))
    vocab = text.vocab
    print('Shape of Vocab:', text.vocab.vectors.shape, '\n')

    lr = args.lr
    num_classes = args.num_class
    epochs = args.epochs
    model_type = args.model
    emb_dim = args.emb_dim
    rnn_hidden_dim = args.rnn_hidden_dim
    num_filt = args.num_filt

    if model_type == 'cnn':
        net = CNN(emb_dim, vocab, num_filt, [3, 4], num_classes)
    elif model_type == 'rnn':
        net = RNN(emb_dim, vocab, rnn_hidden_dim, num_classes)
    elif model_type == 'gru':
        net = GRU(emb_dim, vocab, rnn_hidden_dim, num_classes)
    elif model_type == 'lstm':
        net = LSTM(emb_dim, vocab, rnn_hidden_dim, num_classes)
    else:
        net = Baseline(emb_dim, vocab, num_classes)

    # Use CUDA model if available
    net.to(device)

    # Set up the Adam optimizer and cross-entropy loss
    optimizer = optim.Adam(net.parameters(), lr=lr)
    loss_fcn = nn.CrossEntropyLoss()

    # Plotting data
    plot_epoch = list(range(1, epochs + 1))
    plot_train_loss, plot_train_acc, plot_valid_loss, plot_valid_acc = [], [], [], []

    print('---------- TRAINING LOOP ---------- \n')

    # Begin training loop
    for epoch in range(epochs):
        cum_loss = 0
        for (i, batch) in enumerate(train_iter, 1):
            # Set network to training mode
            net.train()
            optimizer.zero_grad()

            # Get data for the current batch
            batch_input, batch_length = batch.text
            batch_input = batch_input.to(device)
            batch_label = nn.functional.one_hot(batch.label).float()

            # Forward step to get the prediction
            if model_type in ('rnn', 'gru', 'lstm'):
                output = net(batch_input, batch_length)
            else:
                output = net(batch_input)

            # Loss calculation and parameter update; CrossEntropyLoss expects
            # class indices, so the one-hot labels are converted back
            loss = loss_fcn(output, many_cold(batch_label).long().to(device))
            cum_loss += loss.item()  # accumulate a float, not a graph node
            loss.backward()
            optimizer.step()

        # Stats for plotting
        net.eval()
        train_loss, train_acc = eval_acc(net, train_iter, loss_fcn, model_type, 'train')
        valid_loss, valid_acc = eval_acc(net, val_iter, loss_fcn, model_type, 'val')
        plot_train_loss.append(train_loss / train_data_count.shape[0])
        plot_train_acc.append(train_acc)
        plot_valid_loss.append(valid_loss / val_data_count.shape[0])
        plot_valid_acc.append(valid_acc)

        # Print per-epoch progress; losses are averaged per batch / per sample
        # (the original divided both by epoch + 1, which shrinks the reported
        # loss artificially as training proceeds)
        print('[%d] Train Loss: %.3f Valid Loss: %.3f Train Acc: %.3f Valid Acc: %.3f '
              % (epoch + 1, cum_loss / i,
                 valid_loss / val_data_count.shape[0], train_acc, valid_acc))

    # Final results, with losses normalized per sample to match the plotted
    # curves (the original divided by epoch + 1 here as well)
    test_loss, test_acc = eval_acc(net, test_iter, loss_fcn, model_type, 'test')
    val_loss, val_acc = eval_acc(net, val_iter, loss_fcn, model_type, 'valid')
    train_loss, train_acc = eval_acc(net, train_iter, loss_fcn, model_type, 'train')

    print()
    print('---------- FINAL RESULTS ----------')
    print()
    print('Final Training Loss: ' + str(train_loss / train_data_count.shape[0])
          + ', Final Training Acc: ' + str(train_acc))
    print('Final Validation Loss: ' + str(val_loss / val_data_count.shape[0])
          + ', Final Validation Acc: ' + str(val_acc))
    print('Final Test Loss: ' + str(test_loss / test_data_count.shape[0])
          + ', Final Test Acc: ' + str(test_acc))

    '''
    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train_data, val_data, test_data),
        batch_sizes=(len(train_data), len(val_data), len(test_data)),
        sort_key=lambda x: len(x.text), device=None,
        sort_within_batch=True, repeat=False)

    for (i, batch) in enumerate(train_iter, 1):
        # Set network to eval mode
        net.eval()

        # Get data for the current batch
        batch_input, batch_length = batch.text
        batch_input = batch_input.to(device)
        batch_label = nn.functional.one_hot(batch.label).float()

        # Forward step to get the prediction
        if model_type == 'rnn' or model_type == 'gru':
            output = net(batch_input, batch_length)
        else:
            output = net(batch_input)

        outputs = many_cold(output)
        batch_label = many_cold(batch_label)
        print("Below is Confusion Matrix for Training Set")
        print(confusion_matrix(batch_label, outputs))
    '''

    # Collect validation labels and predictions across batches
    batch_label = torch.empty(0).to(device).float()
    output = torch.empty(0).to(device)
    for (i, batch) in enumerate(val_iter, 1):
        # Get data for the current batch
        batch_input, batch_length = batch.text
        batch_input = batch_input.to(device)
        batch_label = torch.cat((batch_label, batch.label.to(device).float()))

        # Forward step to get the prediction
        if model_type in ('rnn', 'gru', 'lstm'):
            output = torch.cat((output, net(batch_input, batch_length)))
        else:
            output = torch.cat((output, net(batch_input)))
    outputs = many_cold(output)

    # Print the number of trainable parameters in the model
    # https://discuss.pytorch.org/t/how-do-i-check-the-number-of-parameters-of-a-model/4325/7
    print()
    print('The number of trainable parameters in the model is:')
    print(sum(p.numel() for p in net.parameters() if p.requires_grad))

    print()
    print("Below is Confusion Matrix for Validation Set")
    print(confusion_matrix(batch_label.cpu(), outputs.cpu()))

    # Collect test labels and predictions across batches
    batch_label = torch.empty(0).to(device).float()
    output = torch.empty(0).to(device)
    for (i, batch) in enumerate(test_iter, 1):
        # Get data for the current batch
        batch_input, batch_length = batch.text
        batch_input = batch_input.to(device)
        batch_label = torch.cat((batch_label, batch.label.to(device).float()))

        # Forward step to get the prediction
        if model_type in ('rnn', 'gru', 'lstm'):
            output = torch.cat((output, net(batch_input, batch_length)))
        else:
            output = torch.cat((output, net(batch_input)))
    outputs = many_cold(output)

    # Save the model
    if args.save:
        torch.save(net, 'model_' + model_type + '.pt')

    # Confusion matrix for the test set
    print()
    print("Below is Confusion Matrix for Test Set")
    plot_confusion_matrix(batch_label.cpu(), outputs.cpu(), classes=subreddits)
    plt.savefig('model_' + model_type + '_confusion.png')
    plt.show()

    # Plot losses and accuracy
    plt.figure()
    plt.plot(plot_epoch, plot_train_loss, label='Training Loss')
    plt.plot(plot_epoch, plot_valid_loss, label='Validation Loss')
    plt.title('Losses as Function of Epoch (' + args.model + ')')
    plt.ylabel("Loss")
    plt.xlabel("Epoch")
    plt.legend()
    plt.savefig('model_' + model_type + '_loss.png')
    plt.show()

    # Plot accuracy
    plt.figure()
    plt.plot(plot_epoch, plot_train_acc, label='Training Accuracy')
    plt.plot(plot_epoch, plot_valid_acc, label='Validation Accuracy')
    plt.title('Accuracy as Function of Epoch (' + args.model + ')')
    plt.ylim(0, 1.01)
    plt.ylabel("Accuracy")
    plt.xlabel("Epoch")
    plt.legend()
    plt.savefig('model_' + model_type + '_accuracy.png')
    plt.show()
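# many_cold() is used throughout main() but never defined in these snippets.
# From its call sites (it inverts nn.functional.one_hot and reduces network
# logits to class indices), a minimal sketch under that assumption is:
import torch

def many_cold(one_hot):
    """Inverse of one-hot encoding: index of the largest entry along dim 1."""
    return torch.argmax(one_hot, dim=1)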
def nn(X_train, X_test, y_train, y_test, class_num, input_dim, epochs,
       batch_size, optimizer, loss):
    # Neural network: two hidden ReLU layers with dropout, softmax output
    model = Sequential()
    model.add(Dense(32, input_dim=input_dim, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(16, activation='relu'))
    model.add(Dropout(0.2))
    # model.add(Dense(4, activation='relu'))
    # model.add(Dropout(0.3))
    model.add(Dense(class_num, activation='softmax'))

    if optimizer == "sgd":
        optimizer_using = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    elif optimizer == "adam":
        optimizer_using = "adam"

    if loss == "binary":
        loss_using = 'binary_crossentropy'
    elif loss == "categorical":
        loss_using = 'categorical_crossentropy'

    model.compile(loss=loss_using, optimizer=optimizer_using, metrics=['accuracy'])
    history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size)

    # Convert the softmax outputs and the one-hot test labels back to class
    # indices
    y_pred = model.predict(X_test)
    pred = [np.argmax(p) for p in y_pred]
    test = [np.argmax(t) for t in y_test]

    from sklearn.metrics import accuracy_score
    a = accuracy_score(test, pred)
    print("")
    print('Accuracy is:', a * 100)
    print("")
    print("----------------------")

    if class_num == 2:
        classes = ['0', '1']
    else:
        classes = ['0', '1', '2', '3']
    np.set_printoptions(precision=2)
    plot_confusion_matrix(test, pred, classes=classes, normalize=False,
                          title=None, cmap=plt.cm.Blues)
    plt.show()

    # Fit again with validation data to record learning curves; note this
    # continues training the already fitted model for another 100 epochs
    history = model.fit(X_train, y_train, validation_data=(X_test, y_test),
                        epochs=100, batch_size=64)

    # (newer Keras versions use the keys 'accuracy' / 'val_accuracy')
    plt.plot(history.history['acc'])
    plt.plot(history.history['val_acc'])
    plt.title('Model accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()

    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('Model loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()
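# nn() assumes y_train/y_test are already one-hot encoded (it argmaxes y_test
# to recover labels). A hypothetical call, assuming a feature matrix X and
# integer label vector y exist and using keras' to_categorical, might be:
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
nn(X_train, X_test, to_categorical(y_train), to_categorical(y_test),
   class_num=4, input_dim=X.shape[1], epochs=50, batch_size=64,
   optimizer="adam", loss="categorical")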
def ml(class_num, epochs, method, source_data, twitter_source, google_source,
       ig_source, judge=True, nan=True):
    if judge == True:
        tmp_data = pie(num=11, source_data=source_data, all_option=True)
        final_data = combine(tmp_data, twitter_source, google_source, ig_source)
        input_x, revised_y = data_preprocess(final_data, nan=nan)

        # Split by release week ('上映日期' is the release-date column): every
        # fourth ISO week goes to the test set, the rest to the training set.
        # (dt.weekofyear is deprecated; newer pandas uses dt.isocalendar().week)
        a = pd.to_datetime(final_data['上映日期'])
        cut = a.dt.weekofyear
        test_list = []
        train_list = []
        test = 0
        train = 0
        for i in range(len(final_data)):
            if cut[i] % 4 == 0:
                test += 1
                test_list.append(i)
            else:
                train += 1
                train_list.append(i)
        print(train)
        print(test)

        # Build the train/test frames by dropping the complementary rows
        train_final_data = final_data
        test_final_data = final_data
        for i in test_list:
            train_final_data = train_final_data.drop(final_data.index[i])
        train_final_data = train_final_data.reset_index(drop=True)
        for i in train_list:
            test_final_data = test_final_data.drop(final_data.index[i])
        test_final_data = test_final_data.reset_index(drop=True)

        X_train, y_train = data_preprocess(train_final_data, nan=nan)
        X_test, y_test = data_preprocess(test_final_data, nan=nan)

        if method == "random_forest":
            y_test, ans_best = random_forest(input_x, revised_y, X_train, y_train,
                                             X_test, y_test, judge=judge)
        elif method == "decision_tree":
            y_test, ans_best = decision_tree(input_x, revised_y, X_train, y_train,
                                             X_test, y_test, judge=judge)
        else:
            y_test, ans_best = xgboost(input_x, revised_y, X_train, y_train,
                                       X_test, y_test, class_num=class_num,
                                       num=epochs, judge=judge)
    else:
        tmp_data = pie(num=11, source_data=source_data, all_option=True)
        final_data = combine(tmp_data, twitter_source, google_source, ig_source)
        input_x, revised_y = data_preprocess(final_data, nan=nan)

        if method == "random_forest":
            y_test, ans_best = random_forest(input_x, revised_y, X_train=0,
                                             y_train=0, X_test=0, y_test=0,
                                             judge=judge)
        elif method == "decision_tree":
            y_test, ans_best = decision_tree(input_x, revised_y, X_train=0,
                                             y_train=0, X_test=0, y_test=0,
                                             judge=judge)
        else:
            y_test, ans_best = xgboost(input_x, revised_y, X_train=0, y_train=0,
                                       X_test=0, y_test=0, class_num=class_num,
                                       num=epochs, judge=judge)

    # The confusion-matrix plotting was duplicated verbatim in both branches;
    # doing it once here covers both
    if class_num == 2:
        classes = ['0', '1']
    elif class_num == 4:
        classes = ['0', '1', '2', '3']
    np.set_printoptions(precision=2)
    plot_confusion_matrix(y_test, ans_best, classes=classes, normalize=False,
                          title=None, cmap=plt.cm.Blues)
    plt.show()
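# The row-by-row week split in ml() can be expressed as a single boolean mask.
# A minimal sketch using the non-deprecated isocalendar() API (same '上映日期'
# release-date column assumed, final_data as above):
import pandas as pd

week = pd.to_datetime(final_data['上映日期']).dt.isocalendar().week
test_mask = (week % 4 == 0)
train_final_data = final_data[~test_mask].reset_index(drop=True)
test_final_data = final_data[test_mask].reset_index(drop=True)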
# Fragment (apparently the tail of test()): checkpoint the model whenever
# test accuracy improves
    if acc > best_acc:
        print('Saving..')
        torch.save(net.state_dict(), './model.pkl')
        best_acc = acc

best_acc = 0
train_acc = []
test_acc = []
classes = np.array([0, 1, 2, 3, 4])
pred_y = []
truth_y = []

if LOAD:
    print('Loading model ...')
    net.load_state_dict(torch.load(Model))
    test()
else:
    if CONT:
        print('Continue training !')
        net.load_state_dict(torch.load(Model))
    for epoch in range(EPOCH):
        train(epoch)
        test()

plot_confusion_matrix(truth_y, pred_y, classes, True)
plt.show()
#print(train_acc)
#print(test_acc)
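# train() and test() themselves are not included in this snippet. A minimal
# sketch of a test() consistent with the globals above is given below; net,
# device, test_acc, pred_y, truth_y, and best_acc come from the snippet, while
# testloader is a hypothetical DataLoader:
def test():
    global best_acc, pred_y, truth_y
    net.eval()
    correct, total = 0, 0
    pred_y, truth_y = [], []  # keep only the latest epoch's predictions
    with torch.no_grad():
        for inputs, targets in testloader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            predicted = outputs.argmax(dim=1)
            pred_y.extend(predicted.cpu().tolist())
            truth_y.extend(targets.cpu().tolist())
            correct += (predicted == targets).sum().item()
            total += targets.size(0)
    acc = correct / total
    test_acc.append(acc)
    # Checkpoint on improvement, as in the fragment above
    if acc > best_acc:
        print('Saving..')
        torch.save(net.state_dict(), './model.pkl')
        best_acc = acc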