from pathlib import Path

import datawig
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import auc, precision_recall_curve

# `pu` (plot utilities), `helps` (data loading) and `load_result` are
# project-local helpers; they are assumed to be imported alongside the
# third-party modules above.


def plot_f1_cleaning_local(data, result_name: str):
    """
    Plot the per-column cleaning F1-score of the result stored at $data's
    $results_path under the name $result_name, considering only columns
    that contain errors.
    """
    results = load_result(Path(f"{data.results_path}{result_name}.p"))
    local_results = [x for x in results if x.get('label') is not None]
    cleaning_results = [
        x for x in local_results if x['n_errors_in_dirty'] > 0
    ]

    labels = [data.column_map[c['label']] for c in cleaning_results]
    perf_cleaning = [round(c['error_cleaning'], 2) for c in cleaning_results]
    global_f1_score = [
        x['global_error_cleaning'] for x in results
        if x.get('global_error_cleaning') is not None
    ][0]
    print(f'The run has a global f1-score on dataset {data.title} of '
          f'{round(global_f1_score, 5)}')

    pu.figure_setup()
    fig_size = pu.get_fig_size(25, 5)
    fig = plt.figure(figsize=list(fig_size))
    ax = fig.add_subplot(111)

    x = np.arange(len(labels))
    width = 0.35  # the width of the bars

    rects1 = ax.bar(x, perf_cleaning, width, label='Cleaning')

    ax.set_ylabel('Cleaning F1-Score')
    ax.set_title('Performance Cleaning')
    ax.set_xlabel('Columns with Errors')
    ax.set_xticks(x)
    ax.set_xticklabels(labels)
    ax.bar_label(rects1, padding=3)

    return (fig, ax)
def plot_prec_threshold(data, *args):
    """
    Plot global cleaning and detection F1-scores against the precision
    threshold of every run stored at $data's $results_path.
    """
    global_results, prec_thresh = list(), list()
    p = Path(data.results_path)

    # Load every result pickle once, collecting the global results and the
    # precision threshold each run was configured with.
    for r_path in p.glob('*.p'):
        result = load_result(r_path)
        glob = list(
            filter(lambda x: x.get('global_error_detection'), result))
        global_results.append(glob)
        prec_thresh.append(result[0].get('precision_threshold'))

    pu.figure_setup()
    fig_size = pu.get_fig_size(10, 4)
    fig = plt.figure(figsize=list(fig_size))
    ax = fig.add_subplot(111)

    clean = [r[0]['global_error_cleaning'] for r in global_results]
    detect = [r[0]['global_error_detection'] for r in global_results]

    ax.scatter(prec_thresh, clean, label='Error Cleaning Complete Dataset')
    ax.scatter(prec_thresh, detect, label='Error Detecting Complete Dataset')
    ax.legend()
    ax.set_title(
        'Effect of Precision Threshold on Cleaning and Detection Performance')
    ax.set(xlabel='Precision Threshold', ylabel='F1 Score')

    return (fig, ax)
def plot_auc_cleaning_global(data, *args):
    """
    Plot the trend of cleaning models over time, measured as the area
    under the precision-recall curve (PR-AUC) averaged over all RHS
    columns and classes of a run.
    """
    df_clean = helps.load_original_data(data, load_dirty=False)
    df_dirty = helps.load_original_data(data, load_dirty=True)

    local_results, timestamps = list(), list()
    p = Path(data.results_path)
    for r_path in p.glob('*.p'):
        result = load_result(r_path)
        local_results.append(list(filter(lambda x: x.get('label'), result)))
        # One timestamp per run: take the first one recorded in the result.
        ts = [
            x['run_at_timestamp'] for x in result
            if x.get('run_at_timestamp') is not None
        ]
        timestamps.append(ts[0])

    avg_areas_under_curve_per_run = list()
    for local_result in local_results:  # for each cleaning run
        aucs = list()
        for r in local_result:  # for each RHS
            df_clean_y_true = df_clean.loc[:, r['label']]
            imputer = datawig.AutoGluonImputer.load(
                output_path='./', model_name=r['model_checksum'])
            df_probas = imputer.predict(df_dirty, return_probas=True)
            for i in imputer.predictor.class_labels:  # for each class
                prec, rec, _ = precision_recall_curve(
                    df_clean_y_true == i, df_probas.loc[:, i], pos_label=True)
                aucs.append(auc(rec, prec))
        # Average over all RHS columns and classes so that each run
        # contributes exactly one data point, matching `timestamps`.
        avg_areas_under_curve_per_run.append(np.average(aucs))

    pu.figure_setup()
    fig_size = pu.get_fig_size(10, 4)
    fig = plt.figure(figsize=list(fig_size))
    ax = fig.add_subplot(111)

    ax.scatter(timestamps, avg_areas_under_curve_per_run,
               label='AUC Cleaning')
    ax.legend()
    ax.set(xlabel='Timestamp', ylabel='AUC Cleaning Performance')

    return (fig, ax)
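# A minimal, self-contained sketch of the per-class PR-AUC computation that
# plot_auc_cleaning_global performs for every imputer class. The labels and
# probabilities below are made-up toy values; only precision_recall_curve
# and auc from sklearn.metrics are the real APIs used above.
def _demo_pr_auc():
    y_true = np.array(['a', 'b', 'a', 'a', 'b'])  # hypothetical ground truth
    p_a = np.array([0.9, 0.2, 0.7, 0.6, 0.4])  # hypothetical P(class == 'a')
    prec, rec, _ = precision_recall_curve(y_true == 'a', p_a, pos_label=True)
    return auc(rec, prec)  # PR-AUC for the single class 'a'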
def plot_f1_cleaning_detection_global(data, *args):
    """
    Plot the trend of global cleaning and error-detection performance
    over time.
    """
    p = Path(data.results_path)
    all_results = []
    for r_path in p.glob('*.p'):
        all_results.extend(load_result(r_path))

    global_results = list(
        filter(lambda x: x.get('global_error_detection'), all_results))
    timestamps = list(
        map(lambda x: x.get('run_at_timestamp'),
            filter(lambda x: x.get('run_at_timestamp'), all_results)))
    detection = [x['global_error_detection'] for x in global_results]
    cleaning = [x['global_error_cleaning'] for x in global_results]
    prec_thresholds = [
        x['precision_threshold'] for x in all_results
        if x.get('precision_threshold') is not None
    ]

    print("Plotting Datapoints:")
    for det, cln, ts, thresh in zip(detection, cleaning, timestamps,
                                    prec_thresholds):
        print('~~~~~')
        print(f'Detection performance: {round(det, 5)}')
        print(f'Cleaning performance: {round(cln, 5)}')
        print(f'Precision Threshold: {thresh}')
        print(f'Timestamp: {ts}')

    pu.figure_setup()
    fig_size = pu.get_fig_size(10, 4)
    fig = plt.figure(figsize=list(fig_size))
    ax = fig.add_subplot(111)

    ax.scatter(timestamps, detection, label='F1 Error Detection')
    ax.scatter(timestamps, cleaning, label='F1 Data Cleaning')
    ax.legend()
    ax.set(xlabel='Timestamp', ylabel='F1-Score')

    return (fig, ax)
def plot_f1_cleaning_detection_local(data, result_name: str):
    """
    Plot the per-column cleaning and error-detection F1-scores of the
    result stored at $data's $results_path with the name $result_name.
    """
    results = load_result(Path(f"{data.results_path}{result_name}.p"))
    local_results = list(filter(lambda x: x.get('label'), results))

    labels = [data.column_map[c['label']] for c in local_results]
    perf_error_detection = [
        round(c['error_detection'], 2) for c in local_results
    ]
    perf_cleaning = [round(c['error_cleaning'], 2) for c in local_results]

    pu.figure_setup()
    fig_size = pu.get_fig_size(25, 5)
    fig = plt.figure(figsize=list(fig_size))
    ax = fig.add_subplot(111)

    x = np.arange(len(labels))
    width = 0.35  # the width of the bars

    rects1 = ax.bar(x - width / 2, perf_cleaning, width, label='Cleaning')
    rects2 = ax.bar(x + width / 2, perf_error_detection, width,
                    label='Error Detection')

    ax.set_ylabel('F1-Score')
    ax.set_title(r'Performance Cleaning \& Error Detection')
    ax.set_xticks(x)
    ax.set_xticklabels(labels)
    ax.legend()
    ax.bar_label(rects1, padding=3)
    ax.bar_label(rects2, padding=3)

    return (fig, ax)
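# Hedged usage sketch: the attribute names on `data` (results_path,
# column_map, title) mirror how the plotting functions above access it; the
# SimpleNamespace and the placeholder paths are assumptions, not the
# project's actual configuration object.
if __name__ == '__main__':
    from types import SimpleNamespace

    data = SimpleNamespace(results_path='results/',
                           column_map={},
                           title='example dataset')
    fig, ax = plot_f1_cleaning_detection_global(data)
    fig.savefig('f1_over_time.pdf')
    plt.close(fig)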