def compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels, radius_to_consider):
    """Compute the optimal SVM radius using the provided training and evaluation datasets.

    You should only consider radius values within the radius_to_consider list.
    You should use accuracy as a metric for comparing the different radius values.

    Args:
        train_matrix: The word counts for the training data
        train_labels: The spam or not spam labels for the training data
        val_matrix: The word counts for the validation data
        val_labels: The spam or not spam labels for the validation data
        radius_to_consider: The radius values to consider

    Returns:
        The best radius which maximizes SVM accuracy. When several radii tie,
        the one that appears first in radius_to_consider wins. If
        radius_to_consider is empty, 0.0 is returned.
    """
    # *** START CODE HERE ***
    best_rad = 0.0
    # Start below every attainable accuracy so the first candidate is always
    # accepted; otherwise an all-zero-accuracy run would return the 0.0
    # placeholder, which is not one of the candidates.
    best_acc = -1.0
    for radius in radius_to_consider:
        # NOTE(review): presumably returns one predicted label per row of
        # val_matrix as a NumPy array (it is indexed via .shape below).
        val_preds = svm.train_and_predict_svm(train_matrix, train_labels, val_matrix, radius)
        acc = float(np.sum(val_preds == val_labels)) / val_preds.shape[0]
        print(radius, acc)
        if acc > best_acc:
            best_acc = acc
            best_rad = radius
    return best_rad
def compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels, radius_to_consider):
    """Compute the optimal SVM radius using the provided training and evaluation datasets.

    You should only consider radius values within the radius_to_consider list.
    You should use accuracy as a metric for comparing the different radius values.

    Args:
        train_matrix: The word counts for the training data
        train_labels: The spam or not spam labels for the training data
        val_matrix: The word counts for the validation data
        val_labels: The spam or not spam labels for the validation data
        radius_to_consider: The radius values to consider

    Returns:
        The best radius which maximizes SVM accuracy. When several radii tie,
        the one that appears last in radius_to_consider wins. If
        radius_to_consider is empty, 0 is returned.
    """
    # *** START CODE HERE ***
    top_accuracy = 0
    top_radi = 0
    for radi in radius_to_consider:
        cur_preds = svm.train_and_predict_svm(train_matrix, train_labels, val_matrix, radi)
        acc = np.mean(cur_preds == val_labels)
        if acc >= top_accuracy:
            # Record the new best score as well as the radius; without this
            # the comparison is always against 0 and the last radius wins
            # regardless of its accuracy.
            top_accuracy = acc
            top_radi = radi
    return top_radi
def compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels, radius_to_consider):
    """Pick the SVM radius with the highest validation accuracy.

    Each candidate radius is used to train on the training data and predict on
    the validation data; accuracy is the fraction of predictions that equal
    val_labels.

    Args:
        train_matrix: The word counts for the training data
        train_labels: The spam or not spam labels for the training data
        val_matrix: The word counts for the validation data
        val_labels: The spam or not spam labels for the validation data
        radius_to_consider: The radius values to consider

    Returns:
        The radius with the highest accuracy. Candidates are ranked as
        (accuracy, radius) pairs, so ties on accuracy go to the larger radius.

    Raises:
        TypeError: If radius_to_consider is empty (there is no best pair to
            unpack).
    """
    # *** START CODE HERE ***
    leader = None
    for candidate in radius_to_consider:
        predicted = svm.train_and_predict_svm(train_matrix, train_labels, val_matrix, candidate)
        scored = (np.mean(predicted == val_labels), candidate)
        # Replace the leader only when the new pair is strictly greater.
        if leader is None or scored > leader:
            leader = scored
    return leader[1]
def main():
    """Run the Naive Bayes and SVM experiments on the compiled tweet dataset.

    Loads the train/validation/test splits from
    "final_data/compiled_data.csv", builds the dictionary from the training
    tweets, fits and evaluates Naive Bayes on the test split, then selects an
    SVM radius on the validation split and evaluates the SVM on the test
    split. Results are printed; the dictionary and the chosen radius are
    written via util.write_json.
    """
    (train_tweets, val_tweets, test_tweets,
     train_labels, val_labels, test_labels) = load_dataset("final_data/compiled_data.csv")

    dictionary = create_dictionary(train_tweets)
    util.write_json('./output/dictionary', dictionary)

    train_matrix = transform_text(train_tweets, dictionary)
    val_matrix = transform_text(val_tweets, dictionary)
    test_matrix = transform_text(test_tweets, dictionary)

    naive_bayes_model = fit_naive_bayes_model(train_matrix, train_labels)
    naive_bayes_predictions = predict_from_naive_bayes_model(naive_bayes_model, test_matrix)
    naive_bayes_accuracy = np.mean(naive_bayes_predictions == test_labels)

    # Print the class distribution of the predictions next to that of the
    # true test labels so the two can be compared by eye.
    print("naive_bayes_results: ")
    unique, counts = np.unique(naive_bayes_predictions, return_counts=True)
    print(dict(zip(unique, counts)))
    print("test_labels: ")
    unique, counts = np.unique(test_labels, return_counts=True)
    print(dict(zip(unique, counts)))

    print('Naive Bayes had an accuracy of {} on the testing set'.format(naive_bayes_accuracy))

    top_5_words = get_top_five_naive_bayes_words(naive_bayes_model, dictionary)
    print('The top 5 indicative words for Naive Bayes are: ', top_5_words)

    # The radius is tuned on the validation split only; the test split is
    # reserved for the final accuracy figure.
    optimal_radius = compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels,
                                             [0.01, 0.1, 1, 10])
    util.write_json('./output/p06_optimal_radius', optimal_radius)
    print('The optimal SVM radius was {}'.format(optimal_radius))

    svm_predictions = svm.train_and_predict_svm(train_matrix, train_labels, test_matrix, optimal_radius)
    svm_accuracy = np.mean(svm_predictions == test_labels)
    # The template has a single placeholder, so only the accuracy is passed.
    print('The SVM model had an accuracy of {} on the testing set'.format(svm_accuracy))
def main():
    """Tune and evaluate the SVM spam classifier.

    Loads the three spam splits, builds the dictionary from the training
    messages, converts each split to a matrix with transform_text, selects
    the SVM radius on the validation split, and reports accuracy on the test
    split. Test predictions are saved to 'svm_predictions.txt'.
    """
    split_files = ('spam_train.tsv', 'spam_val.tsv', 'spam_test.tsv')
    (
        (train_messages, train_labels),
        (val_messages, val_labels),
        (test_messages, test_labels),
    ) = [util.load_spam_dataset(path) for path in split_files]

    dictionary = create_dictionary(train_messages)
    print('Size of dictionary: ', len(dictionary))

    train_matrix, val_matrix, test_matrix = [
        transform_text(messages, dictionary)
        for messages in (train_messages, val_messages, test_messages)
    ]

    candidate_radii = [0.01, 0.1, 1, 10]
    optimal_radius = compute_best_svm_radius(
        train_matrix, train_labels, val_matrix, val_labels, candidate_radii)
    print('The optimal SVM radius was {}'.format(optimal_radius))

    svm_predictions = svm.train_and_predict_svm(
        train_matrix, train_labels, test_matrix, optimal_radius)
    np.savetxt('svm_predictions.txt', svm_predictions)

    svm_accuracy = np.mean(svm_predictions == test_labels)
    print('The SVM model had an accuracy of {} on the testing set'.format(svm_accuracy))
def compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels, radius_to_consider):
    """Pick the SVM radius with the highest validation accuracy.

    Each candidate radius is used to train on the training data and predict on
    the validation data; accuracy is the mean of predictions == val_labels.

    Args:
        train_matrix: The word counts for the training data
        train_labels: The spam or not spam labels for the training data
        val_matrix: The word counts for the validation data
        val_labels: The spam or not spam labels for the validation data
        radius_to_consider: The radius values to consider

    Returns:
        The radius with the highest accuracy; the earliest candidate wins a
        tie. If radius_to_consider is empty, the sentinel -1.0 is returned.
    """
    # Both trackers start at -1.0 so the first candidate always replaces them.
    chosen_radius = -1.0
    chosen_accuracy = -1.0
    for candidate in radius_to_consider:
        predicted = svm.train_and_predict_svm(train_matrix, train_labels, val_matrix, candidate)
        candidate_accuracy = np.mean(predicted == val_labels)
        if candidate_accuracy > chosen_accuracy:
            chosen_radius, chosen_accuracy = candidate, candidate_accuracy
    return chosen_radius
def compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels, radius_to_consider):
    """Pick the SVM radius with the highest validation accuracy.

    Each candidate radius is used to train on the training data and predict on
    the validation data; accuracy is the mean of predictions == val_labels.

    Args:
        train_matrix: The word counts for the training data
        train_labels: The spam or not spam labels for the training data
        val_matrix: The word counts for the validation data
        val_labels: The spam or not spam labels for the validation data
        radius_to_consider: The radius values to consider (indexed with [0],
            so it must be a non-empty sequence)

    Returns:
        The radius with the highest accuracy. The first candidate is the
        default and is only displaced by a strictly higher accuracy.
    """
    # *** START CODE HERE ***
    winner = radius_to_consider[0]
    winning_score = 0
    for candidate in radius_to_consider:
        predicted = svm.train_and_predict_svm(train_matrix, train_labels, val_matrix, candidate)
        score = np.mean(predicted == val_labels)
        if score > winning_score:
            winner, winning_score = candidate, score
    return winner
def compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels, radius_to_consider):
    """Pick the SVM radius with the highest validation accuracy.

    Builds a radius -> accuracy mapping (accuracy is the mean of
    predictions == val_labels) and returns the key with the largest value.

    Args:
        train_matrix: The word counts for the training data
        train_labels: The spam or not spam labels for the training data
        val_matrix: The word counts for the validation data
        val_labels: The spam or not spam labels for the validation data
        radius_to_consider: The radius values to consider

    Returns:
        The radius with the highest accuracy; on a tie, the radius that was
        inserted into the mapping first wins.

    Raises:
        ValueError: If radius_to_consider is empty (max of an empty mapping).
    """
    # *** START CODE HERE ***
    accuracy_by_radius = {
        candidate: np.mean(
            svm.train_and_predict_svm(train_matrix, train_labels, val_matrix, candidate)
            == val_labels
        )
        for candidate in radius_to_consider
    }
    return max(accuracy_by_radius, key=accuracy_by_radius.get)
def compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels, radius_to_consider):
    """Compute the optimal SVM radius using the provided training and evaluation datasets.

    You should only consider radius values within the radius_to_consider list.
    You should use accuracy as a metric for comparing the different radius values.

    Args:
        train_matrix: The word counts for the training data
        train_labels: The spam or not spam labels for the training data
        val_matrix: The word counts for the validation data
        val_labels: The spam or not spam labels for the validation data
        radius_to_consider: The radius values to consider

    Returns:
        The best radius which maximizes SVM accuracy. When several radii tie,
        the one that appears first in radius_to_consider wins. If
        radius_to_consider is empty, 0.0 is returned.
    """
    # *** START CODE HERE ***
    best_radius = 0.0
    # Start below every attainable accuracy so the first candidate is always
    # accepted; otherwise an all-zero-accuracy run would return the 0.0
    # placeholder, which is not one of the candidates.
    best_acc = -1.0
    for radius in radius_to_consider:
        pred = svm.train_and_predict_svm(train_matrix, train_labels, val_matrix, radius)
        acc = (pred == val_labels).sum() / len(pred)
        if acc > best_acc:
            best_radius = radius
            best_acc = acc
    return best_radius
def compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels, radius_to_consider):
    """Pick the SVM radius with the highest validation accuracy.

    Computes one accuracy per candidate (the mean of val_labels == predictions)
    and returns the candidate at the position of the largest accuracy.

    Args:
        train_matrix: The word counts for the training data
        train_labels: The spam or not spam labels for the training data
        val_matrix: The word counts for the validation data
        val_labels: The spam or not spam labels for the validation data
        radius_to_consider: The radius values to consider (must support
            integer indexing)

    Returns:
        The radius with the highest accuracy; np.argmax picks the first
        occurrence, so the earliest candidate wins a tie.

    Raises:
        ValueError: If radius_to_consider is empty (argmax of an empty
            sequence).
    """
    # *** START CODE HERE ***
    scores = [
        np.mean(
            val_labels
            == svm.train_and_predict_svm(train_matrix, train_labels, val_matrix, candidate)
        )
        for candidate in radius_to_consider
    ]
    best_position = np.argmax(scores)
    return radius_to_consider[best_position]
def compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels, radius_to_consider):
    """Compute the optimal SVM radius using the provided training and evaluation datasets.

    You should only consider radius values within the radius_to_consider list.
    You should use accuracy as a metric for comparing the different radius values.

    Args:
        train_matrix: The word counts for the training data
        train_labels: The spam or not spam labels for the training data
        val_matrix: The word counts for the validation data
        val_labels: The spam or not spam labels for the validation data
        radius_to_consider: The radius values to consider

    Returns:
        The best radius which maximizes SVM accuracy. When several radii tie,
        the one that appears first in radius_to_consider wins. None is
        returned only when radius_to_consider is empty.
    """
    best_rad = None
    # Start below every attainable accuracy so the first candidate is always
    # accepted; otherwise an all-zero-accuracy run would return None even
    # though candidates were evaluated.
    best_acc = -1.0
    for rad in radius_to_consider:
        valid_preds = svm.train_and_predict_svm(train_matrix, train_labels, val_matrix, rad)
        valid_acc = np.mean(valid_preds == val_labels)
        if valid_acc > best_acc:
            best_rad = rad
            best_acc = valid_acc
    return best_rad
def compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels, radius_to_consider):
    """Compute the optimal SVM radius using the provided training and evaluation datasets.

    You should only consider radius values within the radius_to_consider list.
    You should use accuracy as a metric for comparing the different radius values.

    Args:
        train_matrix: The word counts for the training data
        train_labels: The spam or not spam labels for the training data
        val_matrix: The word counts for the validation data
        val_labels: The spam or not spam labels for the validation data
        radius_to_consider: The radius values to consider

    Returns:
        The best radius which maximizes SVM accuracy. When several radii tie,
        the one that appears first in radius_to_consider wins. If
        radius_to_consider is empty, 0 is returned.

    Raises:
        ZeroDivisionError: If val_labels is empty while at least one radius
            is evaluated.
    """
    opt_radius = 0
    # Start below every attainable accuracy so the first candidate is always
    # accepted; otherwise an all-zero-accuracy run would return the 0
    # placeholder, which is not one of the candidates.
    best_accuracy = -1.0
    for radius in radius_to_consider:
        output = svm.train_and_predict_svm(train_matrix, train_labels, val_matrix, radius)
        # NOTE(review): assumes output holds one prediction per validation
        # label, in the same order -- confirm against svm.train_and_predict_svm.
        correct = sum(1 for label, predicted in zip(val_labels, output) if label == predicted)
        accuracy = correct / len(val_labels)
        if accuracy > best_accuracy:
            opt_radius = radius
            best_accuracy = accuracy
    return opt_radius
def main():
    """Run the spam-classification pipeline (Q3.1 - Q3.4).

    Loads the three spam splits, builds the dictionary and word-count
    matrices, fits and evaluates Naive Bayes, reports its top five indicative
    words, then selects an SVM radius on the validation split and evaluates
    the SVM on the test split. Intermediate artefacts are written with
    util.write_json and np.savetxt; accuracies are printed.
    """
    train_messages, train_labels = util.load_spam_dataset('spam_train.tsv')
    val_messages, val_labels = util.load_spam_dataset('spam_val.tsv')
    test_messages, test_labels = util.load_spam_dataset('spam_test.tsv')

    ### Q3.1 ###
    dictionary = create_dictionary(train_messages)
    print('Size of dictionary: ', len(dictionary))
    util.write_json('spam_dictionary', dictionary)

    train_matrix = transform_text(train_messages, dictionary)
    # Only the first 100 rows are saved as a sample of the training matrix.
    np.savetxt('spam_sample_train_matrix', train_matrix[:100, :])

    val_matrix = transform_text(val_messages, dictionary)
    test_matrix = transform_text(test_messages, dictionary)

    ### Q3.2 ###
    naive_bayes_model = fit_naive_bayes_model(train_matrix, train_labels)
    naive_bayes_predictions = predict_from_naive_bayes_model(
        naive_bayes_model, test_matrix)
    np.savetxt('spam_naive_bayes_predictions', naive_bayes_predictions)

    naive_bayes_accuracy = np.mean(naive_bayes_predictions == test_labels)
    print('Naive Bayes had an accuracy of {} on the testing set'.format(
        naive_bayes_accuracy))

    ### Q3.3 ###
    top_5_words = get_top_five_naive_bayes_words(naive_bayes_model, dictionary)
    print('The top 5 indicative words for Naive Bayes are: ', top_5_words)
    util.write_json('spam_top_indicative_words', top_5_words)

    ### Q3.4 ###
    # The radius is tuned on the validation split only; the test split is
    # reserved for the final accuracy figure.
    optimal_radius = compute_best_svm_radius(train_matrix, train_labels,
                                             val_matrix, val_labels,
                                             [0.01, 0.1, 1, 10])
    util.write_json('spam_optimal_radius', optimal_radius)
    print('The optimal SVM radius was {}'.format(optimal_radius))

    svm_predictions = svm.train_and_predict_svm(train_matrix, train_labels,
                                                test_matrix, optimal_radius)
    svm_accuracy = np.mean(svm_predictions == test_labels)
    # The template has a single placeholder, so only the accuracy is passed.
    print('The SVM model had an accuracy of {} on the testing set'.format(
        svm_accuracy))
def main():
    """Run the spam-classification pipeline on the ds6 dataset.

    Loads the three ds6 splits, builds the dictionary and word-count
    matrices, fits and evaluates Naive Bayes, reports its top five indicative
    words, then selects an SVM radius on the validation split and evaluates
    the SVM on the test split. Artefacts are written under './output/';
    accuracies and the elapsed time are printed.
    """
    train_messages, train_labels = util.load_spam_dataset(
        '../data/ds6_train.tsv')
    val_messages, val_labels = util.load_spam_dataset('../data/ds6_val.tsv')
    test_messages, test_labels = util.load_spam_dataset('../data/ds6_test.tsv')

    dictionary = create_dictionary(train_messages)
    util.write_json('./output/p06_dictionary', dictionary)

    train_matrix = transform_text(train_messages, dictionary)
    # Only the first 100 rows are saved as a sample of the training matrix.
    np.savetxt('./output/p06_sample_train_matrix', train_matrix[:100, :])

    val_matrix = transform_text(val_messages, dictionary)
    test_matrix = transform_text(test_messages, dictionary)

    naive_bayes_model = fit_naive_bayes_model(train_matrix, train_labels)
    naive_bayes_predictions = predict_from_naive_bayes_model(
        naive_bayes_model, test_matrix)
    np.savetxt('./output/p06_naive_bayes_predictions', naive_bayes_predictions)

    naive_bayes_accuracy = np.mean(naive_bayes_predictions == test_labels)
    print('Naive Bayes had an accuracy of {} on the testing set'.format(
        naive_bayes_accuracy))

    top_5_words = get_top_five_naive_bayes_words(naive_bayes_model, dictionary)
    print('The top 5 indicative words for Naive Bayes are: ', top_5_words)
    util.write_json('./output/p06_top_indicative_words', top_5_words)

    # The radius is tuned on the validation split only; the test split is
    # reserved for the final accuracy figure.
    optimal_radius = compute_best_svm_radius(train_matrix, train_labels,
                                             val_matrix, val_labels,
                                             [0.01, 0.1, 1, 10])
    util.write_json('./output/p06_optimal_radius', optimal_radius)
    print('The optimal SVM radius was {}'.format(optimal_radius))

    svm_predictions = svm.train_and_predict_svm(train_matrix, train_labels,
                                                test_matrix, optimal_radius)
    svm_accuracy = np.mean(svm_predictions == test_labels)
    # The template has a single placeholder, so only the accuracy is passed.
    print('The SVM model had an accuracy of {} on the testing set'.format(
        svm_accuracy))

    end = time.time()
    # NOTE(review): `start` is not assigned anywhere in this function;
    # presumably it is a module-level timestamp -- confirm it exists,
    # otherwise this line raises NameError.
    print("Execution Time: ", end - start)
def experimenting_without_punctuation():
    """Re-run the spam pipeline for the "no punctuation" experiment.

    Prints an experiment banner, then loads the three spam splits, builds the
    dictionary and word-count matrices, fits and evaluates Naive Bayes,
    reports its top five indicative words, selects an SVM radius on the
    validation split and evaluates the SVM on the test split.

    NOTE(review): nothing in this function strips punctuation itself;
    presumably create_dictionary / transform_text do so -- confirm.
    """
    train_messages, train_labels = util.load_spam_dataset('spam_train.tsv')
    val_messages, val_labels = util.load_spam_dataset('spam_val.tsv')
    test_messages, test_labels = util.load_spam_dataset('spam_test.tsv')

    print(
        'EXPERIMENT: WHAT HAPPENS WHEN WE DELETE PUNCTUATION FROM OUR MESSAGES'
    )

    ### Q3.1 ###
    dictionary = create_dictionary(train_messages)
    print('Size of dictionary: ', len(dictionary))

    train_matrix = transform_text(train_messages, dictionary)
    # Only the first 100 rows are saved as a sample of the training matrix.
    np.savetxt('spam_sample_train_matrix', train_matrix[:100, :])

    val_matrix = transform_text(val_messages, dictionary)
    test_matrix = transform_text(test_messages, dictionary)

    ### Q3.2 ###
    naive_bayes_model = fit_naive_bayes_model(train_matrix, train_labels)
    naive_bayes_predictions = predict_from_naive_bayes_model(
        naive_bayes_model, test_matrix)
    np.savetxt('spam_naive_bayes_predictions', naive_bayes_predictions)

    naive_bayes_accuracy = np.mean(naive_bayes_predictions == test_labels)
    print('Naive Bayes had an accuracy of {} on the testing set'.format(
        naive_bayes_accuracy))

    top_5_words = get_top_five_naive_bayes_words(naive_bayes_model, dictionary)
    print('The top 5 indicative words for Naive Bayes are: ', top_5_words)

    ### Q3.4 ###
    # The radius is tuned on the validation split only; the test split is
    # reserved for the final accuracy figure.
    optimal_radius = compute_best_svm_radius(train_matrix, train_labels,
                                             val_matrix, val_labels,
                                             [0.01, 0.1, 1, 10])
    print('The optimal SVM radius was {}'.format(optimal_radius))

    svm_predictions = svm.train_and_predict_svm(train_matrix, train_labels,
                                                test_matrix, optimal_radius)
    svm_accuracy = np.mean(svm_predictions == test_labels)
    # The template has a single placeholder, so only the accuracy is passed.
    print('The SVM model had an accuracy of {} on the testing set'.format(
        svm_accuracy))
def compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels, radius_to_consider):
    """Pick the SVM radius with the highest validation accuracy.

    Args:
        train_matrix: Training data passed to svm.train_and_predict_svm
        train_labels: Labels for the training data
        val_matrix: Validation data to predict on
        val_labels: Labels the validation predictions are compared against
        radius_to_consider: The radius values to consider (indexed with [0],
            so it must be a non-empty sequence)

    Returns:
        The radius with the highest accuracy. The first candidate is the
        default and is only displaced by a strictly higher accuracy.
    """

    def validation_accuracy(candidate):
        # Fraction of validation predictions that equal val_labels.
        predicted = svm.train_and_predict_svm(train_matrix, train_labels, val_matrix, candidate)
        return np.mean(predicted == val_labels)

    leader = radius_to_consider[0]
    leader_score = 0
    for candidate in radius_to_consider:
        score = validation_accuracy(candidate)
        if score > leader_score:
            leader, leader_score = candidate, score
    return leader
def compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels, radius_to_consider):
    """Pick the SVM radius with the highest validation accuracy.

    Records the accuracy of every candidate radius (the mean of
    predictions == val_labels), printing each one as it is computed, and
    returns the radius that ranks last when sorted by accuracy.

    Args:
        train_matrix: The word counts for the training data
        train_labels: The spam or not spam labels for the training data
        val_matrix: The word counts for the validation data
        val_labels: The spam or not spam labels for the validation data
        radius_to_consider: The radius values to consider

    Returns:
        The radius with the highest accuracy. The sort is stable, so on a tie
        the radius recorded latest wins.

    Raises:
        IndexError: If radius_to_consider is empty.
    """
    # *** START CODE HERE ***
    score_by_radius = {}
    for candidate in radius_to_consider:
        predicted = svm.train_and_predict_svm(train_matrix, train_labels, val_matrix, candidate)
        score_by_radius[candidate] = np.mean(predicted == val_labels)
        print("Radius, accuracy: ", candidate, score_by_radius[candidate])

    # Ascending by accuracy; the best entry ends up at the tail.
    ranked = sorted(score_by_radius.items(), key=lambda pair: pair[1])
    ordered_radii = [radius for radius, _ in ranked]
    return ordered_radii[-1]
def main():
    """Run the spam pipeline with word counts, then with BERT encodings.

    First part: loads the three spam splits, builds the dictionary and
    word-count matrices, fits and evaluates Naive Bayes, reports its top five
    indicative words, selects an SVM radius on the validation split and
    evaluates the SVM on the test split.

    Second part: replaces the matrices with BERT encodings loaded via
    util.load_bert_encoding, selects a logistic-regression learning rate on
    the validation split and evaluates logistic regression on the test split.

    Artefacts are written with util.write_json and np.savetxt; accuracies are
    printed.
    """
    train_messages, train_labels = util.load_spam_dataset('spam_train.tsv')
    val_messages, val_labels = util.load_spam_dataset('spam_val.tsv')
    test_messages, test_labels = util.load_spam_dataset('spam_test.tsv')

    dictionary = create_dictionary(train_messages)
    print('Size of dictionary: ', len(dictionary))
    util.write_json('spam_dictionary', dictionary)

    train_matrix = transform_text(train_messages, dictionary)
    # Only the first 100 rows are saved as a sample of the training matrix.
    np.savetxt('spam_sample_train_matrix', train_matrix[:100,:])

    val_matrix = transform_text(val_messages, dictionary)
    test_matrix = transform_text(test_messages, dictionary)

    naive_bayes_model = fit_naive_bayes_model(train_matrix, train_labels)
    naive_bayes_predictions = predict_from_naive_bayes_model(naive_bayes_model, test_matrix)
    np.savetxt('spam_naive_bayes_predictions', naive_bayes_predictions)

    naive_bayes_accuracy = np.mean(naive_bayes_predictions == test_labels)
    print('Naive Bayes had an accuracy of {} on the testing set'.format(naive_bayes_accuracy))

    top_5_words = get_top_five_naive_bayes_words(naive_bayes_model, dictionary)
    print('The top 5 indicative words for Naive Bayes are: ', top_5_words)
    util.write_json('spam_top_indicative_words', top_5_words)

    # The radius is tuned on the validation split only; the test split is
    # reserved for the final accuracy figure.
    optimal_radius = compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels,
                                             [0.01, 0.1, 1, 10])
    util.write_json('spam_optimal_radius', optimal_radius)
    print('The optimal SVM radius was {}'.format(optimal_radius))

    svm_predictions = svm.train_and_predict_svm(train_matrix, train_labels, test_matrix, optimal_radius)
    svm_accuracy = np.mean(svm_predictions == test_labels)
    # The template has a single placeholder, so only the accuracy is passed.
    print('The SVM model had an accuracy of {} on the testing set'.format(svm_accuracy))

    # From here on the *_matrix names hold BERT encodings instead of word
    # counts; the label arrays loaded above are reused unchanged.
    train_matrix = util.load_bert_encoding('bert_train_matrix.tsv.bz2')
    val_matrix = util.load_bert_encoding('bert_val_matrix.tsv.bz2')
    test_matrix = util.load_bert_encoding('bert_test_matrix.tsv.bz2')

    best_learning_rate = compute_best_logreg_learning_rate(train_matrix, train_labels,
                                                           val_matrix, val_labels,
                                                           [0.01, 0.001, 0.0001, 0.00001, 0.000001])
    print('The best learning rate for logistic regression is {}'.format(best_learning_rate))

    logreg_predictions = logreg.train_and_predict_logreg(train_matrix, train_labels, test_matrix,
                                                         best_learning_rate)
    logreg_accuracy = np.mean(logreg_predictions == test_labels)
    print('The Logistic Regression model with BERT encodings had an accuracy of {} on the testing set'.format(logreg_accuracy))