def evaluate_neural_network(train_file_path: str, validate_file_path: str) -> list:
    """Train the neural network, classify the validation sentences and report accuracy.

    The validation file is read for its true labels, ``neural_net_from_file``
    is run with 20 epochs to label the same file, and the two results are
    paired up by position. A 6x6 confusion matrix (rows: true label, columns:
    predicted label) is printed together with overall and per-label accuracy.

    Args:
        train_file_path: Path to the CSV file of labelled training sentences.
        validate_file_path: Path to the CSV file of labelled validation sentences.

    Returns:
        A list of (sentence, label, predicted-label) tuples, one per prediction.

    Raises:
        ValueError: If a true or predicted label is not one of the six known labels.
    """
    # Pull the labelled sentences from the file and run the model on the same file
    list_of_sentence_label_pairs = file_augmentation.get_sentences_from_csv_file(validate_file_path)
    list_of_sentence_predicted_label_pairs = neural_net_from_file(
        train_file_path, validate_file_path, num_epochs=20)

    # Encoding/decoding tables between label names and matrix positions
    label_to_num_dict = {
        "Facts": 0,
        "Issue": 1,
        "Rule/Law/Holding": 2,
        "Analysis": 3,
        "Conclusion": 4,
        "Others": 5,
    }
    label_dict = {
        0: "Facts",
        1: "Issue",
        2: "Rule/Law/Holding",
        3: "Analysis",
        4: "Conclusion",
        5: "Others",
    }
    confusion_matrix = [[0] * 6 for _ in range(6)]

    return_list = []
    total_classified = 0
    correct_classifications = 0

    # Put everything back together as (sentence, label, predicted label)
    for index, pair in enumerate(list_of_sentence_predicted_label_pairs):
        sentence = pair[0]
        predicted_label = pair[1]
        label = list_of_sentence_label_pairs[index][1]
        return_list.append((sentence, label, predicted_label))

        label_num = label_to_num_dict.get(label)
        predicted_label_num = label_to_num_dict.get(predicted_label)
        if label_num is None or predicted_label_num is None:
            # Fail with a readable message instead of a TypeError from indexing with None
            raise ValueError(
                f"Unrecognised label while scoring: actual={label!r}, "
                f"predicted={predicted_label!r}"
            )

        confusion_matrix[label_num][predicted_label_num] += 1
        total_classified += 1
        # Compare by value; identity of ints is an implementation detail
        if label_num == predicted_label_num:
            correct_classifications += 1

    # Print the categories
    print("\n")
    for index in range(len(confusion_matrix)):
        print(label_dict.get(index), end=" ")

    # Print the overall accuracy (0.0 when there was nothing to classify)
    accuracy = correct_classifications / total_classified if total_classified else 0.0
    print('\n')
    print(f"Classification accuracy, {correct_classifications} / {total_classified} correctly: {accuracy}")

    # Print the confusion matrix, with accuracy rates per label
    for index, label_list in enumerate(confusion_matrix):
        total_classifications_in_category = sum(label_list)
        correct_in_category = label_list[index]
        # An empty category reports 0.0 rather than dividing by zero
        if total_classifications_in_category:
            label_accuracy = correct_in_category / total_classifications_in_category
        else:
            label_accuracy = 0.0
        print(label_list, end=" ")
        print(f"Accuracy for {label_dict.get(index)}, {correct_in_category} / {total_classifications_in_category} classified correctly: {label_accuracy}")

    return return_list
def knn_strictly_from_file(k: int, file_path_to_sentences: str, file_path_to_target_sentences: str) -> list:
    """Classify the sentences stored in one file with KNN against another file.

    The sentences to classify are loaded from ``file_path_to_sentences``
    (sentences only) and handed to ``knn_from_file``, which uses the file at
    ``file_path_to_target_sentences`` as its source of labelled neighbours.

    Returns a list of (sentence, predicted-label) tuples.
    """
    # Load just the sentences we want predictions for
    sentences_to_classify = file_augmentation.get_sentences_from_csv_file(
        file_path_to_sentences,
        sentences_only=True,
    )

    # Delegate the actual classification to the list-based KNN entry point
    predictions = knn_from_file(k, sentences_to_classify, file_path_to_target_sentences)
    return predictions
def neural_net_from_file(train_data_path: str, predict_data_path: str, num_epochs=20) -> list:
    """Train a neural network on one file and use it to label the sentences in another.

    Training data comes from ``train_data_path``; the sentences to classify
    come from ``predict_data_path``. ``num_epochs`` is passed through to
    ``train_model``.

    Returns a list of (sentence, predicted-label) tuples.
    """
    # Load the training set, already split into model inputs and labels
    training_data, training_labels = read_file_and_convert_to_model_data(train_data_path)

    # Build a fresh model and fit it
    fitted_model = train_model(training_data, training_labels, num_epochs, build_model())

    # Fetch the sentences to classify and label them with the fitted model
    sentences_to_classify = file_augmentation.get_sentences_from_csv_file(
        predict_data_path, sentences_only=True)
    return predict_sentence_labels(sentences_to_classify, fitted_model)
def naive_bayes_from_file(file_path_to_train_sentences: str, file_path_to_test_sentences: str, binary_classification=False, alpha=1.0, norm=False) -> list:
    """Train a Naive Bayes classifier from one file and label the sentences in another.

    The classifier is built by ``make_classifier_from_file_path`` from the
    training file (``binary_classification``, ``alpha`` and ``norm`` are
    forwarded to it). The test file is read as (sentence, label) pairs, the
    labels are discarded, and the sentences are classified.

    Returns a list of (sentence, predicted-label) pairs.
    """
    trained_classifier = make_classifier_from_file_path(
        file_path_to_train_sentences, binary_classification, alpha, norm)

    labelled_test_pairs = file_augmentation.get_sentences_from_csv_file(file_path_to_test_sentences)

    # Keep only the sentence out of every (sentence, label) pair
    test_sentences = [entry[0] for entry in labelled_test_pairs]

    # Predict a label for each of them
    return predict_sentence_labels(trained_classifier, test_sentences, binary_classification)
def make_classifier_from_file_path(file_path_to_train_sentences: str, binary_classification=False, alpha=1.0, norm=False):
    """Train a Naive Bayes classifier from a file of labelled sentences and return it.

    Inputs:
    - file_path_to_train_sentences (string): the file-path to the sentences
      and labels to use for training,
    - binary_classification (boolean): whether the classifier should do
      Facts / Non-fact classification or distinguish every label,
    - alpha, norm: forwarded unchanged to ``train_naive_bayes``.

    Outputs:
    - A trained Naive Bayes classifier
    """
    # Load the labelled sentences and embed them as vectors
    labelled_sentences = file_augmentation.get_sentences_from_csv_file(file_path_to_train_sentences)
    embedded_sentences = tfhub_embedding.embed_sentence_list(labelled_sentences)

    # Fit the classifier on the embedded data
    return train_naive_bayes(embedded_sentences, binary_classification, alpha, norm)
def knn_from_file(
    k: int,
    list_of_sentences: list,
    file_path: str,
) -> list:
    """Label every sentence in list_of_sentences with KNN.

    The labelled sentences read from the file at file_path are embedded and
    used as the neighbours; k is the number of neighbours to check.

    Returns a list of (sentence, label) tuples.
    """
    # Load the labelled neighbours from the CSV file and embed them
    labelled_neighbours = file_augmentation.get_sentences_from_csv_file(file_path)
    embedded_neighbours = tfhub_embedding.embed_sentence_list(labelled_neighbours)

    # Classify the input sentences against those embedded neighbours
    return tag_sentences_via_knn(k, list_of_sentences, embedded_neighbours)
def read_file_and_convert_to_model_data(file_path: str) -> tuple:
    """Read a CSV file of labelled sentences and turn it into model-ready arrays.

    The (sentence, label) pairs from the file are embedded as vectors
    (512-dimensional according to the original documentation), each vector is
    converted with ``item_to_tensor`` and each label is encoded as its class
    number. Both collections are converted to NumPy arrays and given an extra
    axis at position 1.

    Args:
        file_path: Path to the CSV file of labelled sentences.

    Returns:
        A tuple ``(vectors, labels)`` of NumPy arrays.

    Raises:
        ValueError: If a label in the file is not one of the six known labels.
            Previously such a label was silently encoded as ``None``.
    """
    # Read the file and embed the sentences as vectors
    sentence_label_pair_list = file_augmentation.get_sentences_from_csv_file(file_path)
    vector_label_pair_list = tfhub_embedding.embed_sentence_list(sentence_label_pair_list)

    # Dictionary for classifying labels as numbers
    # NOTE(original author): this mapping is what was changed for binary classification
    label_to_num_dict = {
        "Facts": 0,
        "Issue": 1,
        "Rule/Law/Holding": 2,
        "Analysis": 3,
        "Conclusion": 4,
        "Others": 5,
    }

    vector_tensor_list = []
    label_tensor_list = []

    # Split the list of pairs into two parallel lists
    for pair in vector_label_pair_list:
        vector = pair[0]
        label = pair[1]

        vector_tensor = item_to_tensor(vector)

        label_num = label_to_num_dict.get(label)
        if label_num is None:
            raise ValueError(
                f"Unknown label {label!r} in {file_path!r}; "
                f"expected one of {sorted(label_to_num_dict)}"
            )

        # The vector tensor lives inside of a redundant list, so take it out of that
        vector_tensor_list.append(vector_tensor[0])
        label_tensor_list.append(label_num)

    # Convert to NumPy arrays and add a "fake" dimension so that
    # #dims = 3 for specific TensorFlow layers
    vectors = np.expand_dims(np.array(vector_tensor_list), axis=1)
    labels = np.expand_dims(np.array(label_tensor_list), axis=1)
    return (vectors, labels)
def evaluate_knn_with_confusion_matrix(
        k: int, file_path_to_validation_sentences: str,
        file_path_to_target_sentences: str) -> list:
    """Evaluate the KNN classifier and print its confusion matrix.

    Validation sentences: the sentences whose class you wish to predict.
    Target sentences: the sentences you're using as neighbors to predict
    classifications.

    A 6x6 confusion matrix (rows: true label, columns: predicted label) is
    printed together with overall and per-label accuracy.

    Args:
        k: Number of neighbours passed to ``knn_from_file``.
        file_path_to_validation_sentences: Path to the CSV file of labelled
            sentences to classify and score.
        file_path_to_target_sentences: Path to the CSV file of labelled
            sentences used as neighbours.

    Returns:
        A list of (sentence, label, predicted-label) tuples.

    Raises:
        ValueError: If a true or predicted label is not one of the six known labels.
    """
    # 6x6 confusion matrix filled with 0s
    confusion_matrix = [[0] * 6 for _ in range(6)]

    # Two dictionaries for encoding/decoding labels and positions in lists
    # NOTE(original author): these tables are what was changed for binary classification
    label_to_num_dict = {
        "Facts": 0,
        "Issue": 1,
        "Rule/Law/Holding": 2,
        "Analysis": 3,
        "Conclusion": 4,
        "Others": 5,
    }
    label_dict = {
        0: "Facts",
        1: "Issue",
        2: "Rule/Law/Holding",
        3: "Analysis",
        4: "Conclusion",
        5: "Others",
    }

    # Read the validation sentences into a list of (sentence, actual-label) pairs
    list_of_sentence_label_pairs = file_augmentation.get_sentences_from_csv_file(
        file_path_to_validation_sentences)

    # Get just the sentences from that and throw them into the KNN classifier
    list_of_validation_sentences = [pair[0] for pair in list_of_sentence_label_pairs]
    list_of_sentence_predicted_label_pairs = knn_from_file(
        k, list_of_validation_sentences, file_path_to_target_sentences)

    return_list = []
    total_classified = 0
    correct_classifications = 0

    # Put everything back together as (sentence, label, predicted label)
    for index, pair in enumerate(list_of_sentence_predicted_label_pairs):
        sentence = pair[0]
        predicted_label = pair[1]
        label = list_of_sentence_label_pairs[index][1]
        return_list.append((sentence, label, predicted_label))

        label_num = label_to_num_dict.get(label)
        predicted_label_num = label_to_num_dict.get(predicted_label)
        if label_num is None or predicted_label_num is None:
            # Fail with a readable message instead of a TypeError from indexing with None
            raise ValueError(
                f"Unrecognised label while scoring: actual={label!r}, "
                f"predicted={predicted_label!r}"
            )

        confusion_matrix[label_num][predicted_label_num] += 1
        total_classified += 1
        # Compare by value; identity of ints is an implementation detail
        if label_num == predicted_label_num:
            correct_classifications += 1

    # Print the categories
    for index in range(len(confusion_matrix)):
        print(label_dict.get(index), end=" ")

    # Print the overall accuracy (0.0 when there was nothing to classify)
    accuracy = correct_classifications / total_classified if total_classified else 0.0
    print('\n')
    print(
        f"Classification accuracy with k = {k}, {correct_classifications} / {total_classified}: {accuracy}"
    )

    # Print the confusion matrix, with accuracy rates per label
    for index, label_list in enumerate(confusion_matrix):
        total_classifications_in_category = sum(label_list)
        correct_in_category = label_list[index]
        # An empty category reports 0.0 rather than dividing by zero
        if total_classifications_in_category:
            label_accuracy = correct_in_category / total_classifications_in_category
        else:
            label_accuracy = 0.0
        print(label_list, end=" ")
        print(
            f"Accuracy for {label_dict.get(index)}, {correct_in_category} / {total_classifications_in_category} classified correctly: {label_accuracy}"
        )

    return return_list
def evaluate_naive_bayes(train_file_path: str, validate_file_path: str, binary_classification: bool, alpha=1.0, norm=False) -> list:
    """Evaluate how accurate the Naive Bayes classifier is on already classified data.

    A classifier is trained from ``train_file_path``, the sentences of
    ``validate_file_path`` are classified, and the predictions are compared
    with the labels from the file. A confusion matrix (rows: true label,
    columns: predicted label) is printed together with overall and per-label
    accuracy; the matrix is 2x2 for binary classification and 6x6 otherwise.

    Args:
        train_file_path: Path to the CSV file of labelled training sentences.
        validate_file_path: Path to the CSV file of labelled validation sentences.
        binary_classification: ``True`` to score Facts versus Non-fact, where
            every non-"Facts" label counts as Non-fact.
        alpha: Forwarded to ``make_classifier_from_file_path``; also printed.
        norm: Forwarded to ``make_classifier_from_file_path``.

    Returns:
        A list of (sentence, label, predicted-label) tuples.

    Raises:
        ValueError: If a true or predicted label is not in the active label table.
    """
    # Dictionaries and matrix size depend on whether the classification is binary.
    # The strict identity check is kept so that non-bool truthy values keep
    # selecting the six-label setup, exactly as before.
    if binary_classification is True:
        label_to_num_dict = {
            "Facts": 0,
            "Issue": 1,
            "Rule/Law/Holding": 1,
            "Analysis": 1,
            "Conclusion": 1,
            "Others": 1,
            "Non-fact": 1,
        }
        label_dict = {
            0: "Facts",
            1: "Non-fact",
        }
        confusion_matrix = [[0] * 2 for _ in range(2)]
    else:
        label_to_num_dict = {
            "Facts": 0,
            "Issue": 1,
            "Rule/Law/Holding": 2,
            "Analysis": 3,
            "Conclusion": 4,
            "Others": 5,
        }
        label_dict = {
            0: "Facts",
            1: "Issue",
            2: "Rule/Law/Holding",
            3: "Analysis",
            4: "Conclusion",
            5: "Others",
        }
        confusion_matrix = [[0] * 6 for _ in range(6)]

    # Make a classifier with the training data
    classifier = make_classifier_from_file_path(train_file_path, binary_classification, alpha, norm)

    # Get the validation sentences, dropping their labels for prediction
    list_of_sentence_label_pairs = file_augmentation.get_sentences_from_csv_file(validate_file_path)
    list_of_sentences = [pair[0] for pair in list_of_sentence_label_pairs]

    # Predict some labels
    list_of_sentence_predicted_label_pairs = predict_sentence_labels(
        classifier, list_of_sentences, binary_classification)

    return_list = []
    total_classified = 0
    correct_classifications = 0

    # Put everything back together as (sentence, label, predicted label)
    for index, pair in enumerate(list_of_sentence_predicted_label_pairs):
        sentence = pair[0]
        predicted_label = pair[1]
        label = list_of_sentence_label_pairs[index][1]
        return_list.append((sentence, label, predicted_label))

        label_num = label_to_num_dict.get(label)
        predicted_label_num = label_to_num_dict.get(predicted_label)
        if label_num is None or predicted_label_num is None:
            # Fail with a readable message instead of a TypeError from indexing with None
            raise ValueError(
                f"Unrecognised label while scoring: actual={label!r}, "
                f"predicted={predicted_label!r}"
            )

        confusion_matrix[label_num][predicted_label_num] += 1
        total_classified += 1
        # Compare by value; identity of ints is an implementation detail
        if label_num == predicted_label_num:
            correct_classifications += 1

    # Print the categories
    print("\n")
    for index in range(len(confusion_matrix)):
        print(label_dict.get(index), end=" ")

    # Print the overall accuracy (0.0 when there was nothing to classify)
    accuracy = correct_classifications / total_classified if total_classified else 0.0
    print('\n')
    print(f"Alpha: {alpha}")
    print(f"Classification accuracy: {accuracy}")

    # Print the confusion matrix, with accuracy rates per label
    for index, label_list in enumerate(confusion_matrix):
        total_classifications_in_category = sum(label_list)
        correct_in_category = label_list[index]
        # An empty category reports 0.0 rather than dividing by zero
        if total_classifications_in_category:
            label_accuracy = correct_in_category / total_classifications_in_category
        else:
            label_accuracy = 0.0
        print(label_list, end=" ")
        print(f"Accuracy for {label_dict.get(index)}, {correct_in_category} / {total_classifications_in_category} classified correctly: {label_accuracy}")

    return return_list
def evaluate_cerberus_binary(train_data_path, predict_data_path, k=5, num_epochs=20) -> list:
    """Run Cerberus (binary) on the provided inputs and evaluate its performance.

    The labelled sentences of ``predict_data_path`` are read for their true
    labels, ``run_cerberus_binary`` produces the predictions, and the two are
    paired up by position. Labels are scored as Facts versus Non-fact: every
    label other than "Facts" counts as Non-fact. A 2x2 confusion matrix (rows:
    true label, columns: predicted label) is printed together with overall
    and per-label accuracy.

    Args:
        train_data_path: Path to the training data, forwarded to ``run_cerberus_binary``.
        predict_data_path: Path to the CSV file of labelled sentences to classify and score.
        k: Forwarded to ``run_cerberus_binary``.
        num_epochs: Forwarded to ``run_cerberus_binary``.

    Returns:
        A list of (sentence, label, predicted-label) tuples.

    Raises:
        ValueError: If a true or predicted label is not in the label table.
    """
    # Get our sentences with labels attached and also run Cerberus
    list_of_sentence_label_pairs = file_augmentation.get_sentences_from_csv_file(
        predict_data_path)
    list_of_sentence_predicted_label_pairs = run_cerberus_binary(
        train_data_path, predict_data_path, k, num_epochs)

    # Encoding/decoding tables: everything that is not "Facts" collapses to Non-fact
    label_to_num_dict = {
        "Facts": 0,
        "Issue": 1,
        "Rule/Law/Holding": 1,
        "Analysis": 1,
        "Conclusion": 1,
        "Others": 1,
        "Non-fact": 1,
    }
    label_dict = {0: "Facts", 1: "Non-fact"}
    confusion_matrix = [[0] * 2 for _ in range(2)]

    return_list = []
    total_classified = 0
    correct_classifications = 0

    # Put everything back together as (sentence, label, predicted label)
    for index, pair in enumerate(list_of_sentence_predicted_label_pairs):
        sentence = pair[0]
        predicted_label = pair[1]
        label = list_of_sentence_label_pairs[index][1]
        return_list.append((sentence, label, predicted_label))

        label_num = label_to_num_dict.get(label)
        predicted_label_num = label_to_num_dict.get(predicted_label)
        if label_num is None or predicted_label_num is None:
            # Fail with a readable message instead of a TypeError from indexing with None
            raise ValueError(
                f"Unrecognised label while scoring: actual={label!r}, "
                f"predicted={predicted_label!r}"
            )

        confusion_matrix[label_num][predicted_label_num] += 1
        total_classified += 1
        # Compare by value; identity of ints is an implementation detail
        if label_num == predicted_label_num:
            correct_classifications += 1

    # Print the categories
    print("\n")
    for index in range(len(confusion_matrix)):
        print(label_dict.get(index), end=" ")

    # Print the overall accuracy (0.0 when there was nothing to classify)
    accuracy = correct_classifications / total_classified if total_classified else 0.0
    print('\n')
    print(
        f"Classification accuracy, {correct_classifications} / {total_classified} correct: {accuracy}"
    )

    # Print the confusion matrix, with accuracy rates per label
    for index, label_list in enumerate(confusion_matrix):
        total_classifications_in_category = sum(label_list)
        correct_in_category = label_list[index]
        # A category with no examples used to raise ZeroDivisionError here;
        # report 0.0 instead
        if total_classifications_in_category:
            label_accuracy = correct_in_category / total_classifications_in_category
        else:
            label_accuracy = 0.0
        print(label_list, end=" ")
        print(
            f"Accuracy for {label_dict.get(index)}, {correct_in_category} / {total_classifications_in_category} classified correctly: {label_accuracy}"
        )

    return return_list