예제 #1
0
def compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels,
                            radius_to_consider):
    """Compute the optimal SVM radius using the provided training and evaluation datasets.

    You should only consider radius values within the radius_to_consider list.
    You should use accuracy as a metric for comparing the different radius values.

    Args:
        train_matrix: The word counts for the training data
        train_labels: The spam or not spam labels for the training data
        val_matrix: The word counts for the validation data
        val_labels: The spam or not spam labels for the validation data
        radius_to_consider: The radius values to consider

    Returns:
        The best radius which maximizes SVM accuracy. When several radii tie,
        the earliest one in radius_to_consider is returned. 0.0 is returned
        only when radius_to_consider is empty.
    """
    # *** START CODE HERE ***
    best_rad = 0.0
    # Seed below any achievable accuracy so the first candidate is always
    # accepted, even when every candidate scores 0 on the validation set.
    best_acc = -1.0
    for radius in radius_to_consider:
        val_preds = svm.train_and_predict_svm(train_matrix, train_labels,
                                              val_matrix, radius)
        # Fraction of validation predictions that match the labels.
        acc = float(np.sum(val_preds == val_labels)) / val_preds.shape[0]
        print(radius, acc)
        if acc > best_acc:
            best_acc = acc
            best_rad = radius
    return best_rad
예제 #2
0
def compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels,
                            radius_to_consider):
    """Compute the optimal SVM radius using the provided training and evaluation datasets.

    You should only consider radius values within the radius_to_consider list.
    You should use accuracy as a metric for comparing the different radius values.

    Args:
        train_matrix: The word counts for the training data
        train_labels: The spam or not spam labels for the training data
        val_matrix: The word counts for the validation data
        val_labels: The spam or not spam labels for the validation data
        radius_to_consider: The radius values to consider

    Returns:
        The best radius which maximizes SVM accuracy. When several radii tie,
        the latest one in radius_to_consider is returned (the comparison is
        ``>=``). 0 is returned only when radius_to_consider is empty.
    """
    # *** START CODE HERE ***
    top_accuracy = 0
    top_radi = 0
    for radi in radius_to_consider:
        cur_preds = svm.train_and_predict_svm(train_matrix, train_labels,
                                              val_matrix, radi)
        acc = np.mean(cur_preds == val_labels)
        if acc >= top_accuracy:
            # Record the accuracy together with the radius; without this the
            # comparison above is always true and the last radius always wins.
            top_accuracy = acc
            top_radi = radi

    return top_radi
예제 #3
0
def compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels, radius_to_consider):
    """Pick the SVM radius with the highest validation accuracy.

    Only the values in radius_to_consider are tried, and accuracy on the
    validation data is the sole comparison metric.

    Args:
        train_matrix: The word counts for the training data
        train_labels: The spam or not spam labels for the training data
        val_matrix: The word counts for the validation data
        val_labels: The spam or not spam labels for the validation data
        radius_to_consider: The radius values to consider

    Returns:
        The radius whose predictions are most accurate on the validation data.
        Candidates are ranked as (accuracy, radius) tuples, so an accuracy tie
        goes to the larger radius.
    """
    # *** START CODE HERE ***
    winner = None  # (accuracy, radius) of the best candidate seen so far
    for candidate in radius_to_consider:
        predicted = svm.train_and_predict_svm(train_matrix, train_labels, val_matrix, candidate)
        entry = (np.mean(predicted == val_labels), candidate)
        winner = entry if winner is None else max(winner, entry)
    return winner[1]
예제 #4
0
def main():
    """Run the Naive Bayes and SVM experiments on the compiled tweet dataset.

    Loads the train/validation/test splits, builds the dictionary from the
    training tweets, converts each split with transform_text, and prints the
    test accuracy of Naive Bayes and of the SVM trained with the radius chosen
    on the validation split. The dictionary and the chosen radius are written
    under ./output.
    """
    train_tweets, val_tweets, test_tweets, train_labels, val_labels, test_labels = load_dataset("final_data/compiled_data.csv")
    dictionary = create_dictionary(train_tweets)
    util.write_json('./output/dictionary', dictionary)
    train_matrix = transform_text(train_tweets, dictionary)
    val_matrix = transform_text(val_tweets, dictionary)
    test_matrix = transform_text(test_tweets, dictionary)

    naive_bayes_model = fit_naive_bayes_model(train_matrix, train_labels)
    naive_bayes_predictions = predict_from_naive_bayes_model(naive_bayes_model, test_matrix)
    naive_bayes_accuracy = np.mean(naive_bayes_predictions == test_labels)

    # Print how often each label is predicted next to how often it occurs in
    # the test labels.
    print("naive_bayes_results: ")
    unique, counts = np.unique(naive_bayes_predictions, return_counts=True)
    print(dict(zip(unique, counts)))
    print("test_labels: " )
    unique, counts = np.unique(test_labels, return_counts=True)
    print(dict(zip(unique, counts)))

    print('Naive Bayes had an accuracy of {} on the testing set'.format(naive_bayes_accuracy))
    top_5_words = get_top_five_naive_bayes_words(naive_bayes_model, dictionary)
    print('The top 5 indicative words for Naive Bayes are: ', top_5_words)

    # The radius is selected on the validation split and evaluated on the test split.
    optimal_radius = compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels, [0.01, 0.1, 1, 10])
    util.write_json('./output/p06_optimal_radius', optimal_radius)
    print('The optimal SVM radius was {}'.format(optimal_radius))
    svm_predictions = svm.train_and_predict_svm(train_matrix, train_labels, test_matrix, optimal_radius)
    svm_accuracy = np.mean(svm_predictions == test_labels)
    # The template has a single placeholder, so only the accuracy is passed.
    print('The SVM model had an accuracy of {} on the testing set'.format(svm_accuracy))
예제 #5
0
def main():
    """Tune and evaluate the SVM spam classifier.

    Loads the three spam splits, builds the dictionary from the training
    messages, selects the SVM radius on the validation split, saves the test
    predictions to 'svm_predictions.txt' and prints the test accuracy.
    """
    train_text, train_y = util.load_spam_dataset('spam_train.tsv')
    val_text, val_y = util.load_spam_dataset('spam_val.tsv')
    test_text, test_y = util.load_spam_dataset('spam_test.tsv')

    vocab = create_dictionary(train_text)
    print('Size of dictionary: ', len(vocab))

    # Convert the splits in the same order as before: train, validation, test.
    train_x, val_x, test_x = (
        transform_text(messages, vocab)
        for messages in (train_text, val_text, test_text)
    )

    candidate_radii = [0.01, 0.1, 1, 10]
    best_radius = compute_best_svm_radius(train_x, train_y, val_x, val_y,
                                          candidate_radii)
    print('The optimal SVM radius was {}'.format(best_radius))

    test_predictions = svm.train_and_predict_svm(train_x, train_y,
                                                 test_x, best_radius)
    np.savetxt('svm_predictions.txt', test_predictions)

    test_accuracy = np.mean(test_predictions == test_y)
    print('The SVM model had an accuracy of {} on the testing set'.format(
        test_accuracy))
예제 #6
0
def compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels,
                            radius_to_consider):
    """Pick the SVM radius with the highest validation accuracy.

    Only the values in radius_to_consider are tried, and accuracy on the
    validation data is the sole comparison metric.

    Args:
        train_matrix: The word counts for the training data
        train_labels: The spam or not spam labels for the training data
        val_matrix: The word counts for the validation data
        val_labels: The spam or not spam labels for the validation data
        radius_to_consider: The radius values to consider

    Returns:
        The radius with the highest validation accuracy; the earliest
        candidate wins a tie. -1.0 is returned when radius_to_consider is
        empty.
    """
    # Both trackers start at -1.0, so any real accuracy replaces the seed.
    best_radius, best_accuracy = -1.0, -1.0
    for candidate in radius_to_consider:
        predictions = svm.train_and_predict_svm(train_matrix, train_labels,
                                                val_matrix, candidate)
        candidate_accuracy = np.mean(predictions == val_labels)
        if candidate_accuracy > best_accuracy:
            best_radius, best_accuracy = candidate, candidate_accuracy
    return best_radius
예제 #7
0
파일: p06_spam.py 프로젝트: mtjin96/CS229
def compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels,
                            radius_to_consider):
    """Pick the SVM radius with the highest validation accuracy.

    Only the values in radius_to_consider are tried, and accuracy on the
    validation data is the sole comparison metric.

    Args:
        train_matrix: The word counts for the training data
        train_labels: The spam or not spam labels for the training data
        val_matrix: The word counts for the validation data
        val_labels: The spam or not spam labels for the validation data
        radius_to_consider: The radius values to consider; must be non-empty
            because its first element is the initial answer

    Returns:
        The radius with the highest validation accuracy. The first candidate
        is returned when no candidate scores above 0, and the earliest
        candidate wins a tie.
    """
    # *** START CODE HERE ***
    # (radius, accuracy) of the current leader, seeded with the first candidate.
    best = (radius_to_consider[0], 0)
    for candidate in radius_to_consider:
        predictions = svm.train_and_predict_svm(train_matrix, train_labels,
                                                val_matrix, candidate)
        score = np.mean(predictions == val_labels)
        if score > best[1]:
            best = (candidate, score)
    return best[0]
예제 #8
0
def compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels,
                            radius_to_consider):
    """Compute the optimal SVM radius using the provided training and evaluation datasets.

    You should only consider radius values within the radius_to_consider list.
    You should use accuracy as a metric for comparing the different radius values.

    Args:
        train_matrix: The word counts for the training data
        train_labels: The spam or not spam labels for the training data
        val_matrix: The word counts for the validation data
        val_labels: The spam or not spam labels for the validation data
        radius_to_consider: The radius values to consider

    Returns:
        The best radius which maximizes SVM accuracy. When several radii tie,
        the earliest one in radius_to_consider is returned.

    Raises:
        ValueError: If radius_to_consider is empty.
    """
    # *** START CODE HERE ***
    # Validation accuracy per radius; dicts keep insertion order, so max()
    # below resolves ties in favour of the earliest candidate.
    accuracy_by_radius = {}
    for radius in radius_to_consider:
        predictions = svm.train_and_predict_svm(train_matrix, train_labels,
                                                val_matrix, radius)
        accuracy_by_radius[radius] = np.mean(predictions == val_labels)
    return max(accuracy_by_radius, key=accuracy_by_radius.get)
예제 #9
0
def compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels,
                            radius_to_consider):
    """Compute the optimal SVM radius using the provided training and evaluation datasets.

    You should only consider radius values within the radius_to_consider list.
    You should use accuracy as a metric for comparing the different radius values.

    Args:
        train_matrix: The word counts for the training data
        train_labels: The spam or not spam labels for the training data
        val_matrix: The word counts for the validation data
        val_labels: The spam or not spam labels for the validation data
        radius_to_consider: The radius values to consider

    Returns:
        The best radius which maximizes SVM accuracy. When several radii tie,
        the earliest one in radius_to_consider is returned. 0.0 is returned
        only when radius_to_consider is empty.
    """
    # *** START CODE HERE ***
    # The accuracy seed sits below any achievable value so the first candidate
    # is always accepted, even if every candidate scores 0 on validation.
    best_radius, best_acc = 0.0, -1.0

    for radius in radius_to_consider:
        pred = svm.train_and_predict_svm(train_matrix, train_labels,
                                         val_matrix, radius)
        # Fraction of validation predictions that match the labels.
        acc = (pred == val_labels).sum() / len(pred)
        if acc > best_acc:
            best_radius = radius
            best_acc = acc
    return best_radius
def compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels,
                            radius_to_consider):
    """Pick the SVM radius with the highest validation accuracy.

    Only the values in radius_to_consider are tried, and accuracy on the
    validation data is the sole comparison metric.

    Args:
        train_matrix: The word counts for the training data
        train_labels: The spam or not spam labels for the training data
        val_matrix: The word counts for the validation data
        val_labels: The spam or not spam labels for the validation data
        radius_to_consider: The radius values to consider

    Returns:
        The radius with the highest validation accuracy; np.argmax picks the
        earliest candidate on a tie.
    """
    # *** START CODE HERE ***
    # One validation accuracy per candidate, in the same order as the input.
    scores = [
        np.mean(val_labels == svm.train_and_predict_svm(
            train_matrix, train_labels, val_matrix, candidate))
        for candidate in radius_to_consider
    ]
    return radius_to_consider[np.argmax(scores)]
예제 #11
0
def compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels, radius_to_consider):
    """Compute the optimal SVM radius using the provided training and evaluation datasets.

    You should only consider radius values within the radius_to_consider list.
    You should use accuracy as a metric for comparing the different radius values.

    Args:
        train_matrix: The word counts for the training data
        train_labels: The spam or not spam labels for the training data
        val_matrix: The word counts for the validation data
        val_labels: The spam or not spam labels for the validation data
        radius_to_consider: The radius values to consider

    Returns:
        The best radius which maximizes SVM accuracy. When several radii tie,
        the earliest one in radius_to_consider is returned. None is returned
        only when radius_to_consider is empty.
    """
    best_rad = None
    # Seed below any achievable accuracy; with a seed of 0, a run where every
    # candidate scores 0 would return None instead of a candidate.
    best_acc = -1.0
    for rad in radius_to_consider:
        valid_preds = svm.train_and_predict_svm(train_matrix, train_labels,
                                                val_matrix, rad)
        valid_acc = np.mean(valid_preds == val_labels)
        if valid_acc > best_acc:
            best_rad = rad
            best_acc = valid_acc
    return best_rad
예제 #12
0
def compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels, radius_to_consider):
    """Compute the optimal SVM radius using the provided training and evaluation datasets.

    You should only consider radius values within the radius_to_consider list.
    You should use accuracy as a metric for comparing the different radius values.

    Args:
        train_matrix: The word counts for the training data
        train_labels: The spam or not spam labels for the training data
        val_matrix: The word counts for the validation data
        val_labels: The spam or not spam labels for the validation data
        radius_to_consider: The radius values to consider

    Returns:
        The best radius which maximizes SVM accuracy. When several radii tie,
        the earliest one in radius_to_consider is returned. 0 is returned only
        when radius_to_consider is empty.
    """
    opt_radius = 0
    # Seed below any achievable accuracy so the first candidate is always
    # accepted, even if every candidate scores 0 on the validation set.
    accuracy = -1.0
    for radius in radius_to_consider:
        output = svm.train_and_predict_svm(train_matrix, train_labels, val_matrix, radius)
        # Count positions where the prediction matches the validation label.
        correct = sum(1 for j, expected in enumerate(val_labels) if expected == output[j])
        candidate_accuracy = correct / len(val_labels)
        if candidate_accuracy > accuracy:
            opt_radius = radius
            accuracy = candidate_accuracy
    return opt_radius
예제 #13
0
def main():
    """Run the spam-classification experiments end to end.

    Loads the three spam splits, builds the dictionary from the training
    messages, fits and evaluates Naive Bayes on the test split, reports the
    five most indicative words, then selects the SVM radius on the validation
    split and evaluates the SVM on the test split. Intermediate artifacts are
    written to files prefixed with 'spam_'.
    """
    train_messages, train_labels = util.load_spam_dataset('spam_train.tsv')
    val_messages, val_labels = util.load_spam_dataset('spam_val.tsv')
    test_messages, test_labels = util.load_spam_dataset('spam_test.tsv')

    ### Q3.1 ###
    dictionary = create_dictionary(train_messages)

    print('Size of dictionary: ', len(dictionary))

    util.write_json('spam_dictionary', dictionary)

    train_matrix = transform_text(train_messages, dictionary)

    # Only the first 100 rows of the training matrix are saved as a sample.
    np.savetxt('spam_sample_train_matrix', train_matrix[:100, :])

    val_matrix = transform_text(val_messages, dictionary)
    test_matrix = transform_text(test_messages, dictionary)

    ### Q3.2 ###
    naive_bayes_model = fit_naive_bayes_model(train_matrix, train_labels)

    naive_bayes_predictions = predict_from_naive_bayes_model(
        naive_bayes_model, test_matrix)

    np.savetxt('spam_naive_bayes_predictions', naive_bayes_predictions)

    naive_bayes_accuracy = np.mean(naive_bayes_predictions == test_labels)

    print('Naive Bayes had an accuracy of {} on the testing set'.format(
        naive_bayes_accuracy))

    ### Q3.3 ###
    top_5_words = get_top_five_naive_bayes_words(naive_bayes_model, dictionary)

    print('The top 5 indicative words for Naive Bayes are: ', top_5_words)

    util.write_json('spam_top_indicative_words', top_5_words)

    ### Q3.4 ###
    # The radius is selected on the validation split and evaluated on the test split.
    optimal_radius = compute_best_svm_radius(train_matrix, train_labels,
                                             val_matrix, val_labels,
                                             [0.01, 0.1, 1, 10])

    util.write_json('spam_optimal_radius', optimal_radius)

    print('The optimal SVM radius was {}'.format(optimal_radius))

    svm_predictions = svm.train_and_predict_svm(train_matrix, train_labels,
                                                test_matrix, optimal_radius)

    svm_accuracy = np.mean(svm_predictions == test_labels)

    # The template has a single placeholder, so only the accuracy is passed.
    print('The SVM model had an accuracy of {} on the testing set'.format(
        svm_accuracy))
예제 #14
0
def main():
    """Run the spam-classification experiments and report the elapsed time.

    Loads the ds6 splits from ../data, builds the dictionary from the training
    messages, fits and evaluates Naive Bayes on the test split, reports the
    five most indicative words, then selects the SVM radius on the validation
    split and evaluates the SVM on the test split. Artifacts are written under
    ./output with a 'p06_' prefix. The wall-clock time of the whole run is
    printed at the end.
    """
    # Take the start timestamp locally so the timing below does not depend on
    # a variable defined outside this function.
    start = time.time()

    train_messages, train_labels = util.load_spam_dataset(
        '../data/ds6_train.tsv')
    val_messages, val_labels = util.load_spam_dataset('../data/ds6_val.tsv')
    test_messages, test_labels = util.load_spam_dataset('../data/ds6_test.tsv')

    dictionary = create_dictionary(train_messages)

    util.write_json('./output/p06_dictionary', dictionary)

    train_matrix = transform_text(train_messages, dictionary)

    # Only the first 100 rows of the training matrix are saved as a sample.
    np.savetxt('./output/p06_sample_train_matrix', train_matrix[:100, :])

    val_matrix = transform_text(val_messages, dictionary)
    test_matrix = transform_text(test_messages, dictionary)

    naive_bayes_model = fit_naive_bayes_model(train_matrix, train_labels)

    naive_bayes_predictions = predict_from_naive_bayes_model(
        naive_bayes_model, test_matrix)

    np.savetxt('./output/p06_naive_bayes_predictions', naive_bayes_predictions)

    naive_bayes_accuracy = np.mean(naive_bayes_predictions == test_labels)

    print('Naive Bayes had an accuracy of {} on the testing set'.format(
        naive_bayes_accuracy))

    top_5_words = get_top_five_naive_bayes_words(naive_bayes_model, dictionary)

    print('The top 5 indicative words for Naive Bayes are: ', top_5_words)

    util.write_json('./output/p06_top_indicative_words', top_5_words)

    # The radius is selected on the validation split and evaluated on the test split.
    optimal_radius = compute_best_svm_radius(train_matrix, train_labels,
                                             val_matrix, val_labels,
                                             [0.01, 0.1, 1, 10])

    util.write_json('./output/p06_optimal_radius', optimal_radius)

    print('The optimal SVM radius was {}'.format(optimal_radius))

    svm_predictions = svm.train_and_predict_svm(train_matrix, train_labels,
                                                test_matrix, optimal_radius)

    svm_accuracy = np.mean(svm_predictions == test_labels)

    # The template has a single placeholder, so only the accuracy is passed.
    print('The SVM model had an accuracy of {} on the testing set'.format(
        svm_accuracy))
    end = time.time()
    print("Execution Time: ", end - start)
예제 #15
0
def experimenting_without_punctuation():
    """Re-run the Naive Bayes and SVM pipeline for the punctuation experiment.

    Loads the three spam splits, builds the dictionary from the training
    messages, evaluates Naive Bayes on the test split, reports the five most
    indicative words, then selects the SVM radius on the validation split and
    evaluates the SVM on the test split.

    NOTE(review): no punctuation is stripped in this function itself;
    presumably that happens inside create_dictionary / transform_text.
    Confirm against those helpers.
    """
    train_messages, train_labels = util.load_spam_dataset('spam_train.tsv')
    val_messages, val_labels = util.load_spam_dataset('spam_val.tsv')
    test_messages, test_labels = util.load_spam_dataset('spam_test.tsv')

    print(
        'EXPERIMENT: WHAT HAPPENS WHEN WE DELETE PUNCTUATION FROM OUR MESSAGES'
    )

    ### Q3.1 ###
    dictionary = create_dictionary(train_messages)

    print('Size of dictionary: ', len(dictionary))

    train_matrix = transform_text(train_messages, dictionary)

    # Only the first 100 rows of the training matrix are saved as a sample.
    np.savetxt('spam_sample_train_matrix', train_matrix[:100, :])

    val_matrix = transform_text(val_messages, dictionary)
    test_matrix = transform_text(test_messages, dictionary)

    ### Q3.2 ###
    naive_bayes_model = fit_naive_bayes_model(train_matrix, train_labels)

    naive_bayes_predictions = predict_from_naive_bayes_model(
        naive_bayes_model, test_matrix)

    np.savetxt('spam_naive_bayes_predictions', naive_bayes_predictions)

    naive_bayes_accuracy = np.mean(naive_bayes_predictions == test_labels)

    print('Naive Bayes had an accuracy of {} on the testing set'.format(
        naive_bayes_accuracy))

    top_5_words = get_top_five_naive_bayes_words(naive_bayes_model, dictionary)

    print('The top 5 indicative words for Naive Bayes are: ', top_5_words)

    ### Q3.4 ###
    # The radius is selected on the validation split and evaluated on the test split.
    optimal_radius = compute_best_svm_radius(train_matrix, train_labels,
                                             val_matrix, val_labels,
                                             [0.01, 0.1, 1, 10])

    print('The optimal SVM radius was {}'.format(optimal_radius))

    svm_predictions = svm.train_and_predict_svm(train_matrix, train_labels,
                                                test_matrix, optimal_radius)

    svm_accuracy = np.mean(svm_predictions == test_labels)

    # The template has a single placeholder, so only the accuracy is passed.
    print('The SVM model had an accuracy of {} on the testing set'.format(
        svm_accuracy))
예제 #16
0
def compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels,
                            radius_to_consider):
    """Pick the SVM radius with the highest validation accuracy.

    Args:
        train_matrix: Training inputs passed to svm.train_and_predict_svm
        train_labels: Labels for the training inputs
        val_matrix: Validation inputs the trained SVM predicts on
        val_labels: Labels the validation predictions are compared against
        radius_to_consider: The radius values to consider; must be non-empty
            because its first element is the initial answer

    Returns:
        The radius with the highest validation accuracy. The first candidate
        is returned when no candidate scores above 0, and the earliest
        candidate wins a tie.
    """
    leader = radius_to_consider[0]
    leader_score = 0
    for candidate in radius_to_consider:
        guesses = svm.train_and_predict_svm(train_matrix, train_labels,
                                            val_matrix, candidate)
        score = np.mean(guesses == val_labels)
        if score > leader_score:
            leader, leader_score = candidate, score
    return leader
예제 #17
0
def compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels, radius_to_consider):
    """Pick the SVM radius with the highest validation accuracy.

    Only the values in radius_to_consider are tried, and accuracy on the
    validation data is the sole comparison metric. Each radius and its
    accuracy are printed as they are evaluated.

    Args:
        train_matrix: The word counts for the training data
        train_labels: The spam or not spam labels for the training data
        val_matrix: The word counts for the validation data
        val_labels: The spam or not spam labels for the validation data
        radius_to_consider: The radius values to consider

    Returns:
        The radius with the highest validation accuracy. The sort is stable
        and the last entry is taken, so the latest candidate wins a tie.
    """
    # *** START CODE HERE ***
    score_by_radius = {}
    for candidate in radius_to_consider:
        predicted = svm.train_and_predict_svm(train_matrix, train_labels, val_matrix, candidate)
        score = np.mean(predicted == val_labels)
        score_by_radius[candidate] = score
        print("Radius, accuracy: ", candidate, score)
    # Ascending by accuracy; the best radius ends up last.
    ranked = sorted(score_by_radius, key=score_by_radius.get)
    return ranked[-1]
예제 #18
0
def main():
    """Run the spam-classification experiments end to end.

    Loads the three spam splits, builds the dictionary from the training
    messages, fits and evaluates Naive Bayes on the test split, reports the
    five most indicative words, then selects the SVM radius on the validation
    split and evaluates the SVM on the test split. Finally, it loads the BERT
    encodings, selects a logistic-regression learning rate on the validation
    split and reports test accuracy. Intermediate artifacts are written to
    files prefixed with 'spam_'.
    """
    train_messages, train_labels = util.load_spam_dataset('spam_train.tsv')
    val_messages, val_labels = util.load_spam_dataset('spam_val.tsv')
    test_messages, test_labels = util.load_spam_dataset('spam_test.tsv')

    dictionary = create_dictionary(train_messages)

    print('Size of dictionary: ', len(dictionary))

    util.write_json('spam_dictionary', dictionary)

    train_matrix = transform_text(train_messages, dictionary)

    # Only the first 100 rows of the training matrix are saved as a sample.
    np.savetxt('spam_sample_train_matrix', train_matrix[:100,:])

    val_matrix = transform_text(val_messages, dictionary)
    test_matrix = transform_text(test_messages, dictionary)

    naive_bayes_model = fit_naive_bayes_model(train_matrix, train_labels)

    naive_bayes_predictions = predict_from_naive_bayes_model(naive_bayes_model, test_matrix)

    np.savetxt('spam_naive_bayes_predictions', naive_bayes_predictions)

    naive_bayes_accuracy = np.mean(naive_bayes_predictions == test_labels)

    print('Naive Bayes had an accuracy of {} on the testing set'.format(naive_bayes_accuracy))

    top_5_words = get_top_five_naive_bayes_words(naive_bayes_model, dictionary)

    print('The top 5 indicative words for Naive Bayes are: ', top_5_words)

    util.write_json('spam_top_indicative_words', top_5_words)


    # The radius is selected on the validation split and evaluated on the test split.
    optimal_radius = compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels, [0.01, 0.1, 1, 10])

    util.write_json('spam_optimal_radius', optimal_radius)

    print('The optimal SVM radius was {}'.format(optimal_radius))

    svm_predictions = svm.train_and_predict_svm(train_matrix, train_labels, test_matrix, optimal_radius)

    svm_accuracy = np.mean(svm_predictions == test_labels)

    # The template has a single placeholder, so only the accuracy is passed.
    print('The SVM model had an accuracy of {} on the testing set'.format(svm_accuracy))


    # From here on the matrices hold the BERT encodings, not the
    # dictionary-based ones; the labels are reused unchanged.
    train_matrix = util.load_bert_encoding('bert_train_matrix.tsv.bz2')
    val_matrix = util.load_bert_encoding('bert_val_matrix.tsv.bz2')
    test_matrix = util.load_bert_encoding('bert_test_matrix.tsv.bz2')
    
    best_learning_rate = compute_best_logreg_learning_rate(train_matrix, train_labels, val_matrix, val_labels, [0.01, 0.001, 0.0001, 0.00001, 0.000001])

    print('The best learning rate for logistic regression is {}'.format(best_learning_rate))

    logreg_predictions = logreg.train_and_predict_logreg(train_matrix, train_labels, test_matrix, best_learning_rate)

    logreg_accuracy = np.mean(logreg_predictions == test_labels)

    print('The Logistic Regression model with BERT encodings had an accuracy of {} on the testing set'.format(logreg_accuracy))