class Sketch(object):
    """Abstract holder of x and y values that can be handed to a Plotter.

    The base class only stores data; `plot()` reports an error through the
    debugger, so concrete drawing is presumably provided by subclasses
    defined elsewhere.
    """

    def __init__(self):
        self.debug = Debugger()
        self.x = []  # accumulated x values, in insertion order
        self.y = []  # accumulated y values, in insertion order

    def class_name(self):
        """Return the display name of this class."""
        return 'Sketch'

    def add_x(self, x):
        """Append one x value, or every value of a list/tuple, to `self.x`.

        Args:
            x: a list or tuple of values, or a single real number
                (`int`, `float`, or any NumPy numeric scalar). Booleans are
                rejected, as they were before.

        Anything else is reported through the debugger (the trailing `1`
        is presumably a severity level) and its type is printed; nothing is
        stored in that case.
        """
        if isinstance(x, (list, tuple)):
            self.x.extend(x)
            self.debug.prn(self, 'Added x-vals.')
        elif (isinstance(x, (int, float, np.number))
                and not isinstance(x, bool)):
            # bool is a subclass of int; keep rejecting it explicitly.
            self.x.append(x)
            self.debug.prn(self, 'Added x-val.')
        else:
            self.debug.prn(self, 'Incorrect type passed to add_x()', 1)
            print(type(x))

    def get_x(self):
        """Return the list of stored x values (the live list, not a copy)."""
        return self.x

    def add_y(self, y):
        """Append every value of the iterable `y` to `self.y`."""
        self.y.extend(y)
        self.debug.prn(self, 'Added y-vals.')

    def get_y(self):
        """Return the list of stored y values (the live list, not a copy)."""
        return self.y

    def add(self, c):
        """Append each `(x, y)` pair of the iterable `c` to the stored data."""
        for x_val, y_val in c:
            self.x.append(x_val)
            self.y.append(y_val)
        self.debug.prn(self, 'Added coords.')

    def plot(self):
        """Report that the abstract base class cannot be plotted."""
        self.debug.prn(self, 'Cannot call plot() on abstract Sketch.', 1)
class Plotter(object):
    """Collects Sketch objects and renders them through `plt`.

    `plt` is presumably `matplotlib.pyplot` and `g` a project-wide settings
    object; both are defined outside this block. All drawing calls go to the
    module-level `plt` state, so they act on whatever figure is current.
    """

    def __init__(self):
        self.debug = Debugger()
        self._set_axes()
        self._set_labels()
        self.sketches = []  # sketches queued for drawing
        self.filename = g.files['plot']  # default output path for save()
        self.debug.prn(self, 'Plotter object created.')

    def class_name(self):
        """Return the display name of this class."""
        return 'Plotter'

    def _set_axes(self):
        """Draw the horizontal and vertical lines through the origin."""
        plt.axhline(0, color=g.x_clr)
        plt.axvline(0, color=g.y_clr)
        self.debug.prn(self, 'Axes drawn.', 3)

    def set_x_axis_label(self, x_label):
        """Set the x-axis label."""
        plt.xlabel(x_label)

    def set_y_axis_label(self, y_label):
        """Set the y-axis label."""
        plt.ylabel(y_label)

    def set_axis_labels(self, x_label, y_label):
        """Set both axis labels: x first, then y."""
        self.set_x_axis_label(x_label)
        self.set_y_axis_label(y_label)

    def _set_labels(self):
        """Apply the default axis labels taken from `g`."""
        plt.xlabel(g.x_lbl)
        plt.ylabel(g.y_lbl)
        self.debug.prn(self, 'Labels set.', 3)

    def set_output_filename(self, filename):
        """Set the path that save() writes to."""
        self.filename = filename
        self.debug.prn(self, 'Filename set.')

    def set_title(self, title):
        """Set the plot title."""
        plt.title(title)
        self.debug.prn(self, 'Title set.')

    def get_sketches(self):
        """Return the list of loaded sketches (the live list, not a copy)."""
        return self.sketches

    def load(self, sketches):
        """Queue one Sketch, or every item of a list/tuple, for drawing.

        Args:
            sketches: a Sketch instance (including subclasses) or a list or
                tuple of them. Items of a sequence are not type-checked.

        Any other argument is reported through the debugger and ignored.
        """
        if isinstance(sketches, (list, tuple)):
            self.sketches.extend(sketches)
            self.debug.prn(self, 'Sketches loaded')
        elif isinstance(sketches, Sketch):
            self.sketches.append(sketches)
            self.debug.prn(self, 'Sketch loaded.')
        else:
            self.debug.prn(self, 'load() takes either a Sketch or a list', 1)

    def _plot(self):
        """Ask every loaded sketch to draw itself."""
        for sketch in self.sketches:
            sketch.plot()

    def show(self):
        """Draw all sketches and display the plot."""
        # NOTE(review): show() and save() each re-draw every sketch, so
        # calling both on one figure draws the sketches twice.
        self._plot()
        plt.show()

    def save(self):
        """Draw all sketches and write the plot to `self.filename`."""
        self._plot()
        plt.savefig(self.filename)

    def close(self):
        """Close the current plot."""
        plt.close()
        self.debug.prn(self, 'Plot closed.')
class Analyzer(object):
    """Statistics and diagnostic plots for fitted models.

    Models are duck-typed: the classification helpers call
    `model.get_dataset()` and `model.get_f()`, the regression helpers call
    `model.get_training_x()`, `model.get_training_y()` and `model.get_f()`.
    """

    # Slopes tried by ssr_curve() when the caller does not pass any.
    _DEFAULT_SLOPES = (0.1, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0, 1.5, 2.0, 2.5,
                       3.0, 4.0, 5.0, 7.5, 10.0)

    def __init__(self):
        self.debug = Debugger()

    def class_name(self):
        """Return the display name of this class."""
        return "Analyzer"

    def get_confusion_matrix(self, model, threshold):
        """Count true/false positives/negatives of `model` at `threshold`.

        A value counts as positive when it is strictly greater than
        `threshold`; this applies to both the predictions and the truth.

        Returns:
            `[[tp, fp], [fn, tn]]`.
        """
        ds = model.get_dataset()
        y_true = ds.get_output_col()
        f = model.get_f()
        y_pred = [f(x) for x in ds.get_input_cols()]

        tp = tn = fp = fn = 0
        for y_p, y_t in zip(y_pred, y_true):
            predicted_positive = y_p > threshold
            if y_t > threshold:
                if predicted_positive:
                    tp += 1
                else:
                    fn += 1
            elif predicted_positive:
                fp += 1
            else:
                tn += 1
        return [[tp, fp], [fn, tn]]

    def _get_counts(self, model, threshold):
        """Return `(tp, fp, fn, tn)` from a single confusion-matrix pass."""
        (tp, fp), (fn, tn) = self.get_confusion_matrix(model, threshold)
        return tp, fp, fn, tn

    def get_tp(self, model, threshold):
        """Return the number of true positives."""
        return self.get_confusion_matrix(model, threshold)[0][0]

    def get_fp(self, model, threshold):
        """Return the number of false positives."""
        return self.get_confusion_matrix(model, threshold)[0][1]

    def get_fn(self, model, threshold):
        """Return the number of false negatives."""
        return self.get_confusion_matrix(model, threshold)[1][0]

    def get_tn(self, model, threshold):
        """Return the number of true negatives."""
        return self.get_confusion_matrix(model, threshold)[1][1]

    # The ratio metrics below raise ZeroDivisionError when their
    # denominator is zero (e.g. no positive samples for sensitivity).

    def get_specificity(self, model, threshold):  # Harry
        """Return tn / (tn + fp).

        See https://en.wikipedia.org/wiki/Sensitivity_and_specificity
        """
        _tp, fp, _fn, tn = self._get_counts(model, threshold)
        return tn / (tn + fp)

    def get_sensitivity(self, model, threshold):  # Harry
        """Return tp / (tp + fn)."""
        tp, _fp, fn, _tn = self._get_counts(model, threshold)
        return tp / (tp + fn)

    def get_precision(self, model, threshold):  # Harry
        """Return tp / (tp + fp)."""
        tp, fp, _fn, _tn = self._get_counts(model, threshold)
        return tp / (tp + fp)

    def get_recall(self, model, threshold):  # Harry
        """Return tp / (tp + fn); identical to sensitivity."""
        tp, _fp, fn, _tn = self._get_counts(model, threshold)
        return tp / (tp + fn)

    def get_accuracy(self, model, threshold):  # Harry
        """Return (tp + tn) / (tp + tn + fp + fn)."""
        tp, fp, fn, tn = self._get_counts(model, threshold)
        return (tp + tn) / (tp + tn + fp + fn)

    def get_fallout(self, model, threshold):  # Harry
        """Return fp / (fp + tn)."""
        _tp, fp, _fn, tn = self._get_counts(model, threshold)
        return fp / (fp + tn)

    def get_bias(self):  # Harry
        """Not implemented yet; returns None."""
        pass

    def get_mean(self):  # Harry
        """Not implemented yet; returns None."""
        pass

    def get_auc(self):  # Harry
        """Not implemented yet; returns None."""
        pass

    def get_p_by_f_dist(self):  # Harry
        """Not implemented yet; returns None."""
        pass

    def get_variance(self, model):
        """Return the population variance of the model's training x values."""
        return self.get_variance_by_parts(model.get_f(),
                                          model.get_training_x())

    def get_variance_by_parts(self, f, x_vals):
        """Return the population variance of `x_vals`.

        Args:
            f: unused; kept so existing callers keep working.
            x_vals: sized sequence of numbers (must be non-empty).
        """
        x_av = np.mean(x_vals)
        sst = sum((x - x_av)**2 for x in x_vals)
        return sst / len(x_vals)

    def get_r_sq(self, model):
        """Return R^2 = 1 - SS_res / SS_tot on the model's training data.

        Raises ZeroDivisionError (or yields nan/inf with NumPy scalars)
        when every training y equals the mean.
        """
        xs = model.get_training_x()
        ys = model.get_training_y()
        y_av = np.mean(ys)
        f = model.get_f()
        ss_res = 0
        ss_tot = 0
        for x, y in zip(xs, ys):
            ss_res += (y - f(x))**2
            ss_tot += (y - y_av)**2
        g.debug.prn(self, 'R squared calculated.')
        return 1 - (ss_res / ss_tot)

    def plot_roc(self):
        """Set up (but do not yet draw) the ROC plot.

        TODO: compute and plot the curve, then save the image as "roc.png".
        """
        plotter = Plotter()
        plotter.set_title('Receiver Operating Characteristic')
        # Standard ROC axes; the earlier single-argument call raised
        # TypeError because set_axis_labels() needs both labels.
        plotter.set_axis_labels('False Positive Rate', 'True Positive Rate')
        plotter.set_output_filename('roc.png')
        plotter.close()

    def get_ss_res(self, coords, f):
        """Return the sum of squared residuals of `f` over `coords`.

        Args:
            coords: iterable of points where item `[0]` is x and `[1]` is y.
            f: callable mapping x to the predicted y.
        """
        return sum((coord[1] - f(coord[0]))**2 for coord in coords)

    def ssr_curve(self, x, y, slopes=None):
        """Plot the sum of squared residuals against candidate slopes.

        For each slope the intercept is chosen so the line passes through
        the point of means `(mean(x), mean(y))`. The plot is written to
        `g.files['ls-ssr']` and then rescaled via ImageManager.

        Args:
            x, y: sequences of observed values.
            slopes: slopes to evaluate; defaults to a fixed set from 0.1
                to 10.0.
        """
        if slopes is None:
            slopes = self._DEFAULT_SLOPES
        slopes = list(slopes)  # the x-values are handed to the sketch as a list

        x_av = np.mean(x)
        y_av = np.mean(y)
        points = list(zip(x, y))
        ssrs = []
        for slope in slopes:
            yint = y_av - slope * x_av
            ssrs.append(self.get_ss_res(
                points, lambda val, m=slope, b=yint: m * val + b))

        image_manager = ImageManager()
        plotter = Plotter()
        plotter.set_title('Sum of Squared Residuals')
        plotter.set_axis_labels('Slope Selected', 'Sum of Squared Residual')
        plotter.set_output_filename(g.files['ls-ssr'])
        ssr_plot = ScatterSketch()
        ssr_plot.add_x(slopes)
        ssr_plot.add_y(ssrs)
        plotter.load(ssr_plot)
        plotter.save()
        plotter.close()
        g.debug.prn(self, 'Drawn Sum of Squared Residuals Plot')
        image_manager.scale(g.files['ls-ssr'], g.files['ls-ssr'], 250)

    def least_squares_slope_yint_eqn(self, x, y):
        """Return `(slope, yint)` of the ordinary least-squares line.

        The denominator is zero when all x values are equal; with plain
        Python numbers that raises ZeroDivisionError.
        """
        n = len(x)
        sum_x = sum(x)
        sum_y = sum(y)
        sum_xy = sum(a * b for a, b in zip(x, y))
        sum_x_sq = sum(a**2 for a in x)
        x_av = np.mean(x)
        y_av = np.mean(y)
        slope = (n * sum_xy - sum_x * sum_y) / (n * sum_x_sq - (sum_x**2))
        yint = y_av - slope * x_av
        return slope, yint

    def f_dist(self, model_type, trials):
        """Plot a histogram of F scores obtained from random data sets.

        Each trial draws random x/y lists from `g.randomizer`, fits a
        least-squares line and records
        `F = ((ss_mean - ss_fit) / (p_fit - p_mean)) / (ss_fit / (n - p_fit))`,
        where `ss_mean` is the squared error around the mean of y. The
        histogram is saved to `g.files['least-squares-f']` and rescaled.

        Args:
            model_type: only `LinearModel` is supported; any other value
                is reported and stops the trials.
            trials: number of random data sets to draw.
        """
        plotter = Plotter()
        image_manager = ImageManager()
        plotter.set_title('F Distribution')
        # The histogram bins F scores along x and counts them along y.
        plotter.set_axis_labels('F Score', 'Frequency')
        plotter.set_output_filename(g.files['least-squares-f'])
        histogram = HistogramSketch()

        p_fit = 2  # TODO: Update for Dataframe
        p_mean = 1  # ""
        for _ in range(trials):
            x_vals = g.randomizer.random_list(g.points_to_gen,
                                              g.lower_x_bound,
                                              g.upper_x_bound)
            y_vals = g.randomizer.random_list(g.points_to_gen,
                                              g.lower_y_bound,
                                              g.upper_y_bound)
            if model_type == LinearModel:
                slope, yint = self.least_squares_slope_yint_eqn(x_vals,
                                                                y_vals)
                func = lambda x, m=slope, b=yint: m * x + b
            else:
                g.debug.prn(self, 'Incompatible model type.', 1)
                break

            points = list(zip(x_vals, y_vals))
            y_av = np.mean(y_vals)
            ss_fit = self.get_ss_res(points, func)
            # Baseline model: always predict the mean of y.
            ss_mean = self.get_ss_res(points, lambda _x: y_av)
            n = len(x_vals)
            if ss_fit == 0 or (n - p_fit) == 0 or (p_fit - p_mean) == 0:
                self.debug.prn(self, 'F distribution cannot divide by zero.',
                               1)
                continue
            numerator = (ss_mean - ss_fit) / (p_fit - p_mean)
            denominator = ss_fit / (n - p_fit)
            histogram.add_x(numerator / denominator)

        histogram.set_bins()
        plotter.load(histogram)
        plotter.save()
        plotter.close()
        image_manager.scale(g.files['least-squares-f'],
                            g.files['least-squares-f'], 250)
        self.debug.prn(self, 'F distribution created.')
class SQLConnection(object):
    """Thin wrapper around one database connection and one cursor.

    `sql` is a module imported elsewhere in the file; the use of
    `connect()` and `executescript()` suggests it is `sqlite3` — confirm
    against the file's imports.
    """

    def __init__(self, addr):
        """Open a connection to `addr` and create a cursor on it."""
        self.addr = addr
        self.conn = sql.connect(addr)
        self.crsr = self.conn.cursor()
        self.debug = Debugger()
        self.debug.prn(self, 'SQLConnection object created.')
        self.debug.prn(self, 'Connection established.')

    def class_name(self):
        """Return the display name of this class."""
        return 'SQLConnection'

    def get_addr(self):
        """Return the stored database address."""
        return self.addr

    def set_addr(self, addr):
        """Replace the stored address.

        NOTE: this does not reconnect; the open connection still points at
        the address given to the constructor.
        """
        self.addr = addr

    def get_conn(self):
        """Return the underlying connection object."""
        return self.conn

    def get_crsr(self):
        """Return the underlying cursor object."""
        return self.crsr

    def queue(self, sql_code, params=None):
        """Execute one statement on the cursor without committing.

        Args:
            sql_code: the SQL statement.
            params: optional values for the statement's placeholders.
                Prefer this over building SQL strings from external data.
        """
        if params is None:
            self.crsr.execute(sql_code)
        else:
            self.crsr.execute(sql_code, params)
        self.debug.prn(self, 'Queued command.')

    def queue_script(self, script):
        """Execute a multi-statement script on the cursor."""
        self.crsr.executescript(script)
        self.debug.prn(self, 'Queued script.')

    def queue_for_all(self, sql_code, data):
        """Execute `sql_code` once for every parameter set in `data`."""
        self.crsr.executemany(sql_code, data)
        self.debug.prn(self, 'Queued command for all.')

    def commit(self):
        """Commit everything executed since the last commit."""
        self.conn.commit()
        self.debug.prn(self, 'Committed queue.')

    def close(self):
        """Close the connection."""
        self.conn.close()
        self.debug.prn(self, 'Connection terminated.')

    def fetch(self, select_code=None, params=None):
        """Return all remaining rows from the cursor.

        Args:
            select_code: optional query to execute first; when omitted the
                rows of the previously executed statement are returned.
            params: optional placeholder values for `select_code`.
        """
        if select_code is not None:
            self.queue(select_code, params)
        rows = self.crsr.fetchall()
        self.debug.prn(self, 'Data fetched.')
        return rows
class Analyzer(object):
    """Statistics and diagnostic plots for fitted models.

    Several classification metrics are placeholders that currently return
    None.
    """

    def __init__(self):
        self.debug = Debugger()

    def class_name(self):
        """Return the display name of this class."""
        return "Analyzer"

    def get_confusion_matrix(self):  # Harry
        """Not implemented yet; returns None."""
        pass

    def get_specificity(self):  # Harry
        """Not implemented yet; returns None."""
        pass

    def get_sensitivity(self):  # Harry
        """Not implemented yet; returns None."""
        pass

    def get_precision(self):  # Harry
        """Not implemented yet; returns None."""
        pass

    def get_recall(self):  # Harry
        """Not implemented yet; returns None."""
        pass

    def get_accuracy(self):  # Harry
        """Not implemented yet; returns None."""
        pass

    def get_fallout(self):  # Harry
        """Not implemented yet; returns None."""
        pass

    def get_bias(self):  # Harry
        """Not implemented yet; returns None."""
        pass

    def get_mean(self):  # Harry
        """Not implemented yet; returns None."""
        pass

    def get_auc(self):  # Harry
        """Not implemented yet; returns None."""
        pass

    def get_p_by_f_dist(self):  # Harry
        """Not implemented yet; returns None."""
        pass

    def get_variance(self, coords, f):
        """Return the mean squared residual of `f` over `coords`.

        Args:
            coords: iterable of `(x, y)` points; iterators such as `zip`
                objects are accepted. Must contain at least one point.
            f: callable mapping x to the predicted y.
        """
        # Materialise once so the points can be both counted and iterated.
        points = list(coords)
        return self.get_ss_res(points, f) / len(points)

    def get_r_sq(self, model):
        """Return R^2 = (var_mean - var_fit) / var_mean on training data.

        `var_mean` is the variance around the mean of the training y
        values, `var_fit` the variance around the model's predictions.
        """
        x = model.get_training_x()
        y = model.get_training_y()
        f = model.get_f()
        y_av = np.mean(y)
        points = list(zip(x, y))
        # Baseline model: always predict the mean of y.
        var_mean = self.get_variance(points, lambda _x: y_av)
        var_fit = self.get_variance(points, f)
        g.debug.prn(self, 'R squared calculated.')
        return (var_mean - var_fit) / var_mean

    def plot_roc(self):
        """Set up (but do not yet draw) the ROC plot.

        TODO: compute and plot the curve, then save the image as "roc.png".
        """
        plotter = Plotter()
        plotter.set_title('Receiver Operating Characteristic')
        # Standard ROC axes; the earlier single-argument call raised
        # TypeError because set_axis_labels() needs both labels.
        plotter.set_axis_labels('False Positive Rate', 'True Positive Rate')
        plotter.close()

    def get_ss_res(self, coords, f):
        """Return the sum of squared residuals of `f` over `coords`.

        Args:
            coords: iterable of points where item `[0]` is x and `[1]` is y.
            f: callable mapping x to the predicted y.
        """
        return sum((coord[1] - f(coord[0])) ** 2 for coord in coords)

    def ssr_curve(self, plotter_func, slopes):
        """Not implemented yet; returns None.

        TODO: Sum of squared residuals plot (ssr.png).
        """
        pass

    def least_squares_slope_yint_eqn(self, x, y):
        """Return `(slope, yint)` of the ordinary least-squares line.

        The denominator is zero when all x values are equal; with plain
        Python numbers that raises ZeroDivisionError.
        """
        n = len(x)
        sum_x = sum(x)
        sum_y = sum(y)
        sum_xy = sum(a * b for a, b in zip(x, y))
        sum_x_sq = sum(a ** 2 for a in x)
        x_av = np.mean(x)
        y_av = np.mean(y)
        slope = (n * sum_xy - sum_x * sum_y) / (n * sum_x_sq - (sum_x ** 2))
        yint = y_av - slope * x_av
        return slope, yint

    def f_dist(self, model_type, trials):
        """Plot a histogram of F scores obtained from random data sets.

        Each trial draws random x/y lists from `g.randomizer`, fits a
        least-squares line and records
        `F = ((ss_mean - ss_fit) / (p_fit - p_mean)) / (ss_fit / (n - p_fit))`,
        where `ss_mean` is the squared error around the mean of y. The
        histogram is saved to 'imgs/f.png' and then rescaled.

        Args:
            model_type: only `LinearModel` is supported; any other value
                is reported and stops the trials.
            trials: number of random data sets to draw.
        """
        plotter = Plotter()
        image_manager = ImageManager()
        plotter.set_title('F Distribution')
        # The histogram bins F scores along x and counts them along y.
        plotter.set_axis_labels('F Score', 'Frequency')
        plotter.set_output_filename('imgs/f.png')
        histogram = HistogramSketch()

        p_fit = 2  # TODO: Update for Dataframe
        p_mean = 1  # ""
        for _ in range(trials):
            x_vals = g.randomizer.random_list(g.points_to_gen,
                                              g.lower_x_bound,
                                              g.upper_x_bound)
            y_vals = g.randomizer.random_list(g.points_to_gen,
                                              g.lower_y_bound,
                                              g.upper_y_bound)
            if model_type == LinearModel:
                slope, yint = self.least_squares_slope_yint_eqn(x_vals,
                                                                y_vals)
                func = lambda x, m=slope, b=yint: m * x + b
            else:
                g.debug.prn(self, 'Incompatible model type.', 1)
                break

            points = list(zip(x_vals, y_vals))
            y_av = np.mean(y_vals)
            ss_fit = self.get_ss_res(points, func)
            # Baseline model: always predict the mean of y.
            ss_mean = self.get_ss_res(points, lambda _x: y_av)
            n = len(x_vals)
            if ss_fit == 0 or (n - p_fit) == 0 or (p_fit - p_mean) == 0:
                self.debug.prn(self, 'F distribution cannot divide by zero.',
                               1)
                continue
            numerator = (ss_mean - ss_fit) / (p_fit - p_mean)
            denominator = ss_fit / (n - p_fit)
            histogram.add_x(numerator / denominator)

        histogram.set_bins()
        plotter.load(histogram)
        plotter.save()
        plotter.close()
        image_manager.scale('imgs/f.png', 'imgs/f.png', 250)
        self.debug.prn(self, 'F distribution created.')