plt.plot(base_zero_false_positives, base_zero_true_positives, label='Base rate model (zeros)')
plt.plot(base_random_false_positives, base_random_true_positives, label='Base rate model (random)')
# plt.plot([0, 1], [0, 1], linestyle='--', label='Random guessing expectation')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
show_plot_and_save_figure('roc_auc_curve')


if __name__ == '__main__':
    # Evaluate the trained logistic-regression model on the dev split:
    # load model + preprocessed dev data, then compare model vs. base-rate AUC.
    model: LogisticRegressionModel = read_pickle(
        GENERATED_LOGISTIC_REGRESSION_MODEL)
    dev_data = read_pickle(GENERATED_LR_PREPROCESSED_DEV_DATA)
    dev_input, dev_expected = extract_input_and_expected(dev_data)

    model_prediction = model.get_predictions(dev_input, args.bias_corrected)
    baserate_prediction = get_baserate_predictions(model_prediction)

    # predictions
    print('Evaluation predictions (int 0|1)')  # fixed typo: was 'preditions'
    print('Predictions Baserate AUC: {:.2f}'.format(
        roc_auc_score(dev_expected, baserate_prediction)))
    print('Predictions Model AUC: {:.2f}'.format(
        roc_auc_score(dev_expected, model_prediction)))
    print('*' * 40)

    # probabilities
import random
import unicodedata

import pandas as pd
from termcolor import colored

from dataaccess.files_constants import get_wiki_batch_path, GENERATED_WIKI_PAGE_MAPPINGS_PATH
from dataaccess.files_io import read_pickle
from model.wiki_document import WikiDocument, WikiLine

# Loaded once at import time: maps a normalized page id -> (batch id, line offset)
wiki_page_mapping: pd.DataFrame = read_pickle(GENERATED_WIKI_PAGE_MAPPINGS_PATH)


def retrieve_wiki_page(page_id: str) -> WikiDocument:
    """Look up a wiki page by id and parse it from its batch file.

    Prints an error (and implicitly returns None) when the mapping points
    at a line that does not exist in the batch file.
    """
    # Normalize so ids like u'Beyonce\u0301' match their composed form 'Beyoncé'
    normalized_id = unicodedata.normalize('NFC', page_id.strip())

    # Resolve which batch file holds the page, and at which line offset
    batch_id, target_line = wiki_page_mapping.loc[normalized_id].values
    batch_path = get_wiki_batch_path(batch_id)

    # Scan the batch file and parse only the single relevant JSON line
    with open(batch_path) as batch_file:
        for line_number, json_line in enumerate(batch_file):
            if line_number == target_line:
                return WikiDocument(json_line)

    # Reaching this point means the mapping referenced a non-existent line
    print(colored('Error: Line {} not found in wiki-page {}'.format(target_line, batch_id), 'red'))
import argparse

import pandas as pd

from _4_B_fit_LR_model import fit_and_get_model, LOSS_HISTORY_FREQUENCY
from dataaccess.files_constants import GENERATED_LOGISTIC_REGRESSION_MODEL, \
    GENERATED_LOGISTIC_REGRESSION_LOSS_HISTORY, GENERATED_LR_PREPROCESSED_TRAINING_DATA
from dataaccess.files_io import read_pickle, write_pickle
from util.LR_NN_preprocessing import extract_input_and_expected
from util.plots import plot_loss_values

# CLI flags for the training run (parsed at import time, like sibling scripts)
parser = argparse.ArgumentParser()
parser.add_argument('--debug', help='use less data and less learning iterations', action='store_true')
parser.add_argument('--num_iterations', type=int, default=100000)
parser.add_argument('--learning_rate', type=float, default=0.1)
args = parser.parse_args()


if __name__ == '__main__':
    # Load the preprocessed training set and split it into inputs / targets
    dataset: pd.DataFrame = read_pickle(GENERATED_LR_PREPROCESSED_TRAINING_DATA)
    inputs, expected = extract_input_and_expected(dataset)

    # Train the logistic-regression model with the requested hyperparameters
    trained_model, loss_history = fit_and_get_model(
        inputs, expected, args.num_iterations, args.learning_rate)

    write_pickle(GENERATED_LOGISTIC_REGRESSION_LOSS_HISTORY, loss_history)  # for plotting
    write_pickle(GENERATED_LOGISTIC_REGRESSION_MODEL, trained_model)

    plot_loss_values(args.num_iterations, args.learning_rate, loss_history, LOSS_HISTORY_FREQUENCY)
def __init__(self, preprocessed_pickle_path: str):
    """Load a preprocessed dataset pickle and split it into inputs and labels."""
    dataset: pd.DataFrame = read_pickle(preprocessed_pickle_path)
    # extract_input_and_expected returns (inputs, labels) as parallel collections
    self.inputs, self.labels = extract_input_and_expected(dataset)
def plot_multiple_loss_values(learning_rates: list, multiple_loss_values: list):
    """Plot the loss curves of several trained models (one per learning rate).

    :param learning_rates: learning-rate labels, parallel to multiple_loss_values
    :param multiple_loss_values: one list of recorded loss values per learning rate
    """
    prepare_seaborn_plots()
    plt.xlabel('Iterations')
    plt.ylabel('Cross-Entropy Loss')
    # Fixed n for the given trained models
    plt.figtext(0.68, 0.56, r'$n = {:,}$'.format(100000))

    # zip the parallel lists instead of enumerate + index access; the distinct
    # name 'step' avoids shadowing the loop variable inside the comprehension
    # (the original reused 'i' for both, which was confusing though harmless)
    for rate, values in zip(learning_rates, multiple_loss_values):
        label = r'$\alpha = {:,}$'.format(float(rate))
        # Loss is recorded every LOSS_HISTORY_FREQUENCY iterations
        x_axis = [step * LOSS_HISTORY_FREQUENCY for step in range(len(values))]
        plt.plot(x_axis, values, linewidth=2, label=label)

    plt.legend(loc='upper right')
    # NOTE(review): 'comparision' is misspelled, but downstream scripts may
    # reference the generated file by this name — confirm before renaming.
    show_plot_and_save_figure(
        'logistic_regression_loss_values_comparision.png')


if __name__ == '__main__':
    # Load the pre-computed loss histories, one pickle per learning rate
    learning_rates = ['0.0001', '0.001', '0.01', '0.1', '1.0']
    filepaths = [
        './generated/logistic_regression_loss_{}.p'.format(rate)
        for rate in learning_rates
    ]
    loss_values = [read_pickle(path) for path in filepaths]
    plot_multiple_loss_values(learning_rates, loss_values)