Example #1
def evaluate_neural_network(train_file_path: str, validate_file_path: str) -> list:
    """Runs the neural network and evaluates how well it classifies the sentences."""
    # Pull the sentences from the file and build the model
    list_of_sentence_label_pairs = file_augmentation.get_sentences_from_csv_file(validate_file_path)
    list_of_sentence_predicted_label_pairs = neural_net_from_file(train_file_path, validate_file_path, num_epochs=20)
    # Make some dictionaries for use later
    label_to_num_dict = {
        "Facts": 0,
        "Issue": 1,
        "Rule/Law/Holding": 2,
        "Analysis": 3,
        "Conclusion": 4,
        "Others": 5,         
    }
    label_dict = {
        0: "Facts",
        1: "Issue",
        2: "Rule/Law/Holding",
        3: "Analysis",
        4: "Conclusion",
        5: "Others"
    }
    confusion_matrix = [[0 for i in range(6)] for j in range(6)]

    return_list = list()
    # Counters for tracking overall accuracy
    total_classified = 0
    correct_classifications = 0
    # Now put everything back together as (sentence, label, predicted label)
    for index, pair in enumerate(list_of_sentence_predicted_label_pairs):
        sentence = pair[0]
        label = list_of_sentence_label_pairs[index][1]
        predicted_label = pair[1]
        # Package it up nicely
        sentence_label_predicted_label_tuple = (sentence, label, predicted_label)
        return_list.append(sentence_label_predicted_label_tuple)
        # Finally, update our confusion matrix and our counters for accuracy
        label_num = label_to_num_dict.get(label)
        predicted_label_num = label_to_num_dict.get(predicted_label)
        confusion_matrix[label_num][predicted_label_num] += 1
        # Update our counters as well
        total_classified += 1
        if label_num == predicted_label_num:
            correct_classifications += 1
    # Print the categories
    print("\n")
    for index, label_list in enumerate(confusion_matrix):
        print(label_dict.get(index), end="   ")
    # Print the overall accuracy
    accuracy = correct_classifications / total_classified
    print('\n')
    print(f"Classification accuracy, {correct_classifications} / {total_classified} correctly: {accuracy}")
    # Print the confusion matrix, with accuracy rates per label
    for index, label_list in enumerate(confusion_matrix):
        total_classifications_in_category = sum(label_list)
        correct_classifications = label_list[index]
        label_accuracy = correct_classifications / (total_classifications_in_category + 0.00001)  # Prevent zero division error
        print(label_list, end="    ")
        print(f"Accuracy for {label_dict.get(index)}, {correct_classifications} / {total_classifications_in_category} classified correctly: {label_accuracy}")
    return return_list
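A minimal usage sketch for this evaluator, with hypothetical CSV paths (the function and its return format are as defined above); the returned tuples make it easy to pull out misclassified sentences for inspection.

# Hypothetical file paths; substitute real training/validation CSVs
train_csv = "data/train_sentences.csv"
validate_csv = "data/validate_sentences.csv"

results = evaluate_neural_network(train_csv, validate_csv)
# Each entry is (sentence, label, predicted-label); collect the misclassified ones
misclassified = [triple for triple in results if triple[1] != triple[2]]
print(f"{len(misclassified)} of {len(results)} sentences were misclassified")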
Example #2
def knn_strictly_from_file(k: int, file_path_to_sentences: str,
                           file_path_to_target_sentences: str) -> list:
    """Uses the sentences found at file_path_to_target_sentences to classify the sentences found at file_path_to_sentences.  
        Returns a list of (sentence, predicted-label) tuples."""
    # Read the sentences-to-predict from file and map them into the appropriate format
    list_of_sentences = file_augmentation.get_sentences_from_csv_file(
        file_path_to_sentences, sentences_only=True)
    # Now use the other KNN method
    return knn_from_file(k, list_of_sentences, file_path_to_target_sentences)
Example #3
def neural_net_from_file(train_data_path: str, predict_data_path: str, num_epochs=20) -> list:
    """Uses the data found at train_data_path to classify the data found at predict_data_path with a neural network.  
        Returns a list of (sentence, predicted-label) tuples."""
    # Read the data and process it into a good format
    data_pair = read_file_and_convert_to_model_data(train_data_path)
    training_data, training_labels = data_pair
    # Build and train the neural network
    model = build_model()
    trained_model = train_model(training_data, training_labels, num_epochs, model)
    # Get the sentences that we want to classify and do so
    list_of_sentences = file_augmentation.get_sentences_from_csv_file(predict_data_path, sentences_only=True)
    predicted_sentence_list = predict_sentence_labels(list_of_sentences, trained_model)
    return predicted_sentence_list
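Because only the sentences (not the labels) are read from predict_data_path, this can also tag a fresh, unlabeled file; a sketch with assumed paths, assuming the file is in whatever format file_augmentation.get_sentences_from_csv_file expects.

# Assumed paths; the prediction CSV only needs to supply sentences
tagged = neural_net_from_file("data/train_sentences.csv", "data/new_cases.csv", num_epochs=30)
for sentence, predicted_label in tagged[:5]:
    print(f"{predicted_label}: {sentence}")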
Example #4
def naive_bayes_from_file(file_path_to_train_sentences: str, file_path_to_test_sentences: str, binary_classification=False, alpha=1.0, norm=False) -> list:
    """Uses the sentences and labels from file_path_to_train_sentences to make a Naive Bayes classifier and then classifies
        the sentences found at file_path_to_test_sentences and returns a list of (sentence, predicted-label) pairs."""
    classifier = make_classifier_from_file_path(file_path_to_train_sentences, binary_classification, alpha, norm)
    test_sentences_with_labels = file_augmentation.get_sentences_from_csv_file(file_path_to_test_sentences)
    # Now split out just the test sentences from the (sentence, label) pairs
    list_of_sentences = [pair[0] for pair in test_sentences_with_labels]
    # Now predict labels
    predictions = predict_sentence_labels(classifier, list_of_sentences, binary_classification)
    return predictions
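A usage sketch for the Naive Bayes path, with hypothetical file paths; binary_classification=True is assumed to return the same "Facts" / "Non-fact" strings used by evaluate_naive_bayes elsewhere in this module.

# Hypothetical training and test CSVs
predictions = naive_bayes_from_file(
    "data/train_sentences.csv",
    "data/test_sentences.csv",
    binary_classification=True,
    alpha=0.5,
)
# Assumes binary predictions come back as "Facts" / "Non-fact" strings
fact_sentences = [sentence for sentence, label in predictions if label == "Facts"]
print(f"{len(fact_sentences)} of {len(predictions)} sentences were predicted to be Facts")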
Example #5
def make_classifier_from_file_path(file_path_to_train_sentences: str, binary_classification=False, alpha=1.0, norm=False):
    """Trains a Naive Bayes classifier and returns it.

    Inputs:
    - file_path_to_train_sentences (string): the file-path to the sentences and labels to use for training,
    - binary_classification (boolean): whether to do binary Facts vs. Non-fact classification or classify each of the six labels,
    - alpha (float): smoothing parameter passed through to train_naive_bayes,
    - norm (boolean): normalization flag passed through to train_naive_bayes,

    Outputs:
    - A trained Naive Bayes classifier"""
    # Read the file with the sentences and labels
    sentences = file_augmentation.get_sentences_from_csv_file(file_path_to_train_sentences)
    # Embed those as vectors
    vector_sentences = tfhub_embedding.embed_sentence_list(sentences)
    # Now make the classifier
    classifier = train_naive_bayes(vector_sentences, binary_classification, alpha, norm)
    return classifier
Example #6
def knn_from_file(
    k: int,
    list_of_sentences: list,
    file_path: str,
) -> list:
    """Uses KNN to classify each sentence in list_of_sentences using sentences and labels from the file at file_path, k is number of neighbors to check.  
        Returns a list of (sentence, label) tuples."""
    # Read the CSV file for target vectors
    sentence_label_pair_list = file_augmentation.get_sentences_from_csv_file(
        file_path)
    # Embed those sentences as the target vectors
    embedded_sentence_label_pair_list = tfhub_embedding.embed_sentence_list(
        sentence_label_pair_list)
    # Run KNN on the sentences with respect to the target vectors
    predicted_sentence_list = tag_sentences_via_knn(
        k, list_of_sentences, embedded_sentence_label_pair_list)
    return predicted_sentence_list
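Unlike knn_strictly_from_file, this variant takes the sentences to classify as an in-memory list, so it can be pointed at ad-hoc text; a sketch with an assumed target CSV of labeled neighbor sentences.

# Assumed CSV of labeled sentences to use as neighbors
candidate_sentences = [
    "The defendant was driving northbound at the time of the accident.",
    "The issue is whether the contract was validly formed.",
]
tagged = knn_from_file(5, candidate_sentences, "data/labeled_sentences.csv")
for sentence, predicted_label in tagged:
    print(f"{predicted_label}: {sentence}")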
Example #7
def read_file_and_convert_to_model_data(file_path: str) -> tuple:
    """Reads the specified CSV file and converts the sentences to 512-dimensional vectors.  
    Returns a tuple of the form (list of sentence-as-vector, list of labels)."""
    # Read the file and turn it into a list
    sentence_label_pair_list = file_augmentation.get_sentences_from_csv_file(file_path)
    # Now embed the sentences as vectors
    vector_label_pair_list = tfhub_embedding.embed_sentence_list(sentence_label_pair_list)
    # Make the two lists to keep track of things
    vector_tensor_list = list()
    label_tensor_list = list()
    # Dictionary for encoding labels as numbers
    # NOTE: this mapping is what gets changed for binary classification
    label_to_num_dict = {
        "Facts": 0,
        "Issue": 1,
        "Rule/Law/Holding": 2,
        "Analysis": 3,
        "Conclusion": 4,
        "Others": 5,
    }

    # Now split the list of tuples into a tuple of lists
    for pair in vector_label_pair_list:
        vector = pair[0]
        label = pair[1]
        # Turn the vector into a tensor
        vector_tensor = item_to_tensor(vector)
        # Get a number classification for the label
        label_tensor = label_to_num_dict.get(label)
        # Add them to their respective lists
        vector_tensor_list.append(vector_tensor[0]) # The vector tensor lives inside of a redundant list, so take it out of that
        label_tensor_list.append(label_tensor)
    # Now package the lists into a tuple after converting them to Numpy arrays and return it
    # Additionally, add a "fake" dimension so that the #dims = 3 for specific TensorFlow layers
    vector_tensor_list = np.expand_dims(np.array(vector_tensor_list), axis=1)
    label_tensor_list = np.expand_dims(np.array(label_tensor_list), axis=1)
    return (vector_tensor_list, label_tensor_list)
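The two expand_dims calls add a length-1 axis so that a batch of N sentence vectors comes out shaped (N, 1, 512) rather than (N, 512), matching what the TensorFlow layers expect; a tiny NumPy-only sketch of that shape change, assuming 512-dimensional embeddings as the docstring states.

import numpy as np

# Three fake 512-dimensional "sentence vectors"
batch = np.array([np.zeros(512) for _ in range(3)])
print(batch.shape)                          # (3, 512)
print(np.expand_dims(batch, axis=1).shape)  # (3, 1, 512)

# Labels get the same treatment: (N,) -> (N, 1)
labels = np.array([0, 2, 5])
print(np.expand_dims(labels, axis=1).shape)  # (3, 1)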
Example #8
def evaluate_knn_with_confusion_matrix(
        k: int, file_path_to_validation_sentences: str,
        file_path_to_target_sentences: str) -> list:
    """Evaluates the KNN classifier by printing the confusion matrix for that classification attempt.  Also returns a list of 
        (sentence, label, predicted-label) tuples.  
        Validation sentences: the sentences whose class you wish to predict.
        Target sentences: the sentences you're using as neighbors to predict classifications."""
    # Make a 2-d array for the confusion matrix, 6x6 matrix filled with 0s
    confusion_matrix = [[0 for i in range(6)] for j in range(6)]
    # For binary classification this would instead be a 2x2 matrix:
    # confusion_matrix = [[0 for i in range(2)] for j in range(2)]
    # Two dictionaries for encoding/decoding labels and positions in lists
    # NOTE: these mappings are what get changed for binary classification
    label_to_num_dict = {
        "Facts": 0,
        "Issue": 1,
        "Rule/Law/Holding": 2,
        "Analysis": 3,
        "Conclusion": 4,
        "Others": 5,
    }
    label_dict = {
        0: "Facts",
        1: "Issue",
        2: "Rule/Law/Holding",
        3: "Analysis",
        4: "Conclusion",
        5: "Others"
    }
    # Read the validation sentences into a list of (sentence, actual-label) pairs
    list_of_sentence_label_pairs = file_augmentation.get_sentences_from_csv_file(
        file_path_to_validation_sentences)
    # Now get just the sentences from that and throw them into the KNN classifier
    list_of_validation_sentences = [pair[0] for pair in list_of_sentence_label_pairs]
    list_of_sentence_predicted_label_pairs = knn_from_file(
        k, list_of_validation_sentences, file_path_to_target_sentences)
    # Make a list to collect the results
    return_list = list()
    # Counters for accuracy
    total_classified = 0
    correct_classifications = 0
    # Now put everything back together as (sentence, label, predicted label)
    for index, pair in enumerate(list_of_sentence_predicted_label_pairs):
        sentence = pair[0]
        label = list_of_sentence_label_pairs[index][1]
        predicted_label = pair[1]
        # Package it up nicely
        sentence_label_predicted_label_tuple = (sentence, label,
                                                predicted_label)
        return_list.append(sentence_label_predicted_label_tuple)
        # Finally, update our confusion matrix and our counters for accuracy
        label_num = label_to_num_dict.get(label)
        predicted_label_num = label_to_num_dict.get(predicted_label)
        confusion_matrix[label_num][predicted_label_num] += 1
        # Update our counters as well
        total_classified += 1
        if label_num == predicted_label_num:
            correct_classifications += 1
    # Print the categories
    for index, label_list in enumerate(confusion_matrix):
        print(label_dict.get(index), end="   ")
    # Print the overall accuracy
    accuracy = correct_classifications / total_classified
    print('\n')
    print(
        f"Classification accuracy with k = {k}, {correct_classifications} / {total_classified}: {accuracy}"
    )
    # Print the confusion matrix, with accuracy rates per label
    for index, label_list in enumerate(confusion_matrix):
        total_classifications_in_category = sum(label_list)
        correct_classifications = label_list[index]
        label_accuracy = correct_classifications / (
            total_classifications_in_category + 0.00001
        )  # Prevent zero division error
        print(label_list, end="    ")
        print(
            f"Accuracy for {label_dict.get(index)}, {correct_classifications} / {total_classifications_in_category} classified correctly: {label_accuracy}"
        )
    return return_list
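Since k is the only hyperparameter here, one natural use of the return value is a small sweep over k; a sketch with assumed validation/target CSV paths.

# Assumed validation and target CSVs
for k in (1, 3, 5, 7, 9):
    results = evaluate_knn_with_confusion_matrix(
        k, "data/validate_sentences.csv", "data/labeled_sentences.csv")
    correct = sum(1 for _, label, predicted in results if label == predicted)
    print(f"k={k}: {correct} / {len(results)} correct")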
Example #9
def evaluate_naive_bayes(train_file_path: str, validate_file_path: str, binary_classification: bool, alpha=1.0, norm=False) -> list:
    """Evaluates how accurate the Naive Bayes classifier is on already classified data.  Returns a list of 
        (sentence, label, predicted-label) tuples."""
    # Dictionaries and lists that depend on whether the classification is binary or not
    if binary_classification is True:
        label_to_num_dict = {
            "Facts": 0,
            "Issue": 1,
            "Rule/Law/Holding": 1,
            "Analysis": 1,
            "Conclusion": 1,
            "Others": 1, 
            "Non-fact": 1,          
        }
        label_dict = {
            0: "Facts",
            1: "Non-fact"
        }
        confusion_matrix = [[0 for i in range(2)] for j in range(2)]
    else:
        label_to_num_dict = {
            "Facts": 0,
            "Issue": 1,
            "Rule/Law/Holding": 2,
            "Analysis": 3,
            "Conclusion": 4,
            "Others": 5,
        }
        label_dict = {
            0: "Facts",
            1: "Issue",
            2: "Rule/Law/Holding",
            3: "Analysis",
            4: "Conclusion",
            5: "Others"
        }
        confusion_matrix = [[0 for i in range(6)] for j in range(6)]
    # Make a classifier with the training data
    classifier = make_classifier_from_file_path(train_file_path, binary_classification, alpha, norm)
    # Get the validation sentences
    list_of_sentence_label_pairs = file_augmentation.get_sentences_from_csv_file(validate_file_path)
    # Pull out just the sentences from the (sentence, label) pairs
    list_of_sentences = [pair[0] for pair in list_of_sentence_label_pairs]
    # Predict some labels
    list_of_sentence_predicted_label_pairs = predict_sentence_labels(classifier, list_of_sentences, binary_classification)
    return_list = list()
    # Counters for tracking overall accuracy
    total_classified = 0
    correct_classifications = 0
    # Now put everything back together as (sentence, label, predicted label)
    for index, pair in enumerate(list_of_sentence_predicted_label_pairs):
        sentence = pair[0]
        label = list_of_sentence_label_pairs[index][1]
        predicted_label = pair[1]
        # Package it up nicely
        sentence_label_predicted_label_tuple = (sentence, label, predicted_label)
        return_list.append(sentence_label_predicted_label_tuple)
        # Finally, update our confusion matrix and our counters for accuracy
        label_num = label_to_num_dict.get(label)
        predicted_label_num = label_to_num_dict.get(predicted_label)
        confusion_matrix[label_num][predicted_label_num] += 1
        # Update our counters as well
        total_classified += 1
        if label_num == predicted_label_num:
            correct_classifications += 1
    # Print the categories
    print("\n")
    for index, label_list in enumerate(confusion_matrix):
        print(label_dict.get(index), end="   ")
    # Print the overall accuracy
    accuracy = correct_classifications / total_classified
    print('\n')
    print(f"Alpha: {alpha}")
    print(f"Classification accuracy: {accuracy}")
    # Print the confusion matrix, with accuracy rates per label
    for index, label_list in enumerate(confusion_matrix):
        total_classifications_in_category = sum(label_list)
        correct_classifications = label_list[index]
        label_accuracy = correct_classifications / (total_classifications_in_category + 0.00001) # Prevent zero division error
        print(label_list, end="    ")
        print(f"Accuracy for {label_dict.get(index)}, {correct_classifications} / {total_classifications_in_category} classified correctly: {label_accuracy}")
    return return_list
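Because alpha is passed straight through to train_naive_bayes, the same evaluator can be reused to compare smoothing values; a sketch with assumed paths.

# Assumed training/validation CSVs; sweep a few smoothing values
for alpha in (0.1, 0.5, 1.0, 2.0):
    results = evaluate_naive_bayes(
        "data/train_sentences.csv", "data/validate_sentences.csv",
        binary_classification=False, alpha=alpha)
    correct = sum(1 for _, label, predicted in results if label == predicted)
    print(f"alpha={alpha}: accuracy {correct / len(results):.3f}")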
Example #10
def evaluate_cerberus_binary(train_data_path,
                             predict_data_path,
                             k=5,
                             num_epochs=20) -> list:
    """Runs Cerberus on the provided inputs and evaluates its performance.  Returns a list of
        (sentence, label, predicted-label) tuples."""
    # Get our sentences with labels attached and also run Cerberus
    list_of_sentence_label_pairs = file_augmentation.get_sentences_from_csv_file(
        predict_data_path)
    list_of_sentence_predicted_label_pairs = run_cerberus_binary(
        train_data_path, predict_data_path, k, num_epochs)
    # Make some dictionaries for use later
    label_to_num_dict = {
        "Facts": 0,
        "Issue": 1,
        "Rule/Law/Holding": 1,
        "Analysis": 1,
        "Conclusion": 1,
        "Others": 1,
        "Non-fact": 1,
    }
    label_dict = {0: "Facts", 1: "Non-fact"}
    confusion_matrix = [[0 for i in range(2)] for j in range(2)]

    return_list = list()
    # Counters for tracking overall accuracy
    total_classified = 0
    correct_classifications = 0
    # Now put everything back together as (sentence, label, predicted label)
    for index, pair in enumerate(list_of_sentence_predicted_label_pairs):
        sentence = pair[0]
        label = list_of_sentence_label_pairs[index][1]
        predicted_label = pair[1]
        # Package it up nicely
        sentence_label_predicted_label_tuple = (sentence, label,
                                                predicted_label)
        return_list.append(sentence_label_predicted_label_tuple)
        # Finally, update our confusion matrix and our counters for accuracy
        label_num = label_to_num_dict.get(label)
        predicted_label_num = label_to_num_dict.get(predicted_label)
        confusion_matrix[label_num][predicted_label_num] += 1
        # Update our counters as well
        total_classified += 1
        if label_num == predicted_label_num:
            correct_classifications += 1
    # Print the categories
    print("\n")
    for index, label_list in enumerate(confusion_matrix):
        print(label_dict.get(index), end="   ")
    # Print the overall accuracy
    accuracy = correct_classifications / total_classified
    print('\n')
    print(
        f"Classification accuracy, {correct_classifications} / {total_classified} correct: {accuracy}"
    )
    # Print the confusion matrix, with accuracy rates per label
    for index, label_list in enumerate(confusion_matrix):
        total_classifications_in_category = sum(label_list)
        correct_classifications = label_list[index]
        label_accuracy = correct_classifications / (
            total_classifications_in_category + 0.00001
        )  # Prevent zero division error
        print(label_list, end="    ")
        print(
            f"Accuracy for {label_dict.get(index)}, {correct_classifications} / {total_classifications_in_category} classified correctly: {label_accuracy}"
        )
    return return_list