def visualize(data):
    try:
        course1 = 'Defense Against the Dark Arts'
        course2 = 'Astronomy'
        scatter(data, course1, course2)
    except Exception:
        tools.error_exit('Failed to visualize data. Is data valid?')
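# scatter() is a helper defined elsewhere in the repo; its real body is not
# shown in this excerpt. A minimal sketch, assuming it plots one course
# against the other with points coloured by Hogwarts House:
import matplotlib.pyplot as plt

def scatter(data, course1, course2):
    for house, group in data.groupby('Hogwarts House'):
        plt.scatter(group[course1], group[course2], label=house, s=4)
    plt.xlabel(course1)
    plt.ylabel(course2)
    plt.legend()
    plt.show()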
def preprocess(args):
    # check if there are any columns with missing/null data
    # print(data.isnull().sum())
    try:
        data = pd.read_csv(args.Dataset)
        data.columns = [
            'id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
            'area_mean', 'smoothness_mean', 'compactness_mean',
            'concavity_mean', 'concave points_mean', 'symmetry_mean',
            'fractal_dimension_mean', 'radius_se', 'texture_se',
            'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se',
            'concavity_se', 'concave points_se', 'symmetry_se',
            'fractal_dimension_se', 'radius_worst', 'texture_worst',
            'perimeter_worst', 'area_worst', 'smoothness_worst',
            'compactness_worst', 'concavity_worst', 'concave points_worst',
            'symmetry_worst', 'fractal_dimension_worst'
        ]
        data = data.drop(columns=['id'])
        data = data.dropna()
        # encode the labels: malignant = 1, benign = 0
        data['diagnosis'] = data['diagnosis'].map({'M': 1, 'B': 0})
    except Exception:
        error_exit('Failed to preprocess data. Is data valid?')
    data = scale(data)
    return data
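# scale() is referenced above but not defined in this excerpt. A minimal
# sketch, assuming z-score normalisation of every feature column while
# leaving the diagnosis label untouched:
def scale(data):
    features = data.columns.drop('diagnosis')
    data[features] = (data[features] - data[features].mean()) / data[features].std()
    return data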
def write_weights(weights, courses):
    try:
        weights = pd.DataFrame.from_dict(weights, columns=courses, orient='index')
        weights.to_csv('../data/weights.csv')
        print('Successfully saved weights to ../data/weights.csv')
    except Exception:
        tools.error_exit('Saving weights.csv failed')
def write_houses(predictions):
    try:
        predictions.to_csv('../data/houses.csv', index_label='Index',
                           header=['Hogwarts House'])
        print('Successfully saved predicted houses to ../data/houses.csv')
    except Exception:
        tools.error_exit('Saving houses.csv failed')
def preprocess(data):
    try:
        normed, X = tools.generic_preprocess(data, 'drop')
        courses = list(normed.columns.values)
        courses[0] = 'intercept'
    except Exception:
        tools.error_exit('Failed to preprocess data. Is data valid?')
    return normed, courses, X
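# tools.generic_preprocess() lives in the shared tools module and is not
# shown here. This sketch is an assumption reconstructed from its call sites:
# it appears to return a normalised frame (house labels first, then course
# scores) plus a design matrix X whose first column is the intercept.
import numpy as np

def generic_preprocess(data, na_strategy):
    scores = data.select_dtypes(include='number')
    if na_strategy == 'drop':
        scores = scores.dropna()
    else:
        scores = scores.fillna(scores.mean())
    normed = (scores - scores.mean()) / scores.std()
    # prepend a column of ones so theta[0] acts as the bias term
    X = np.hstack([np.ones((len(normed), 1)), normed.to_numpy()])
    normed.insert(0, 'Hogwarts House', data.loc[normed.index, 'Hogwarts House'])
    return normed, X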
def check_names(data_path, weights_path):
    if 'dataset' not in data_path:
        tools.error_exit(
            'Data path specified ({}) does not include dataset'.format(data_path))
    if 'weights' not in weights_path:
        tools.error_exit(
            'Weights path specified ({}) does not include weights'.format(weights_path))
def preprocess(data, plot_all):
    try:
        # drop the non-course identity columns
        data = data.drop(columns=['Index', 'First Name', 'Last Name',
                                  'Birthday', 'Best Hand'])
        if not plot_all:
            data = data.drop(columns=['Arithmancy', 'Care of Magical Creatures',
                                      'Defense Against the Dark Arts'])
        data = data.dropna()
    except Exception:
        tools.error_exit('Failed to preprocess data. Is data valid?')
    return data
def find_accuracy(true, predicted):
    try:
        if len(true) != len(predicted):
            tools.error_exit('Number of true and predicted houses different')
        decimal = accuracy_score(true, predicted)
        percent = decimal * 100
    except Exception:
        tools.error_exit('Failed to find accuracy, are you sure predictions are valid?')
    return round(percent, 2)
def pair_plot(feature):
    try:
        sns.pairplot(feature, hue='diagnosis', palette='husl',
                     markers=['o', 's'], height=4)
        plt.show()
    except Exception:
        tools.error_exit('Failed to visualize data. Is data valid?')
def main():
    try:
        args = get_args()
        data = preprocess(args)
        if args.visualize_data:
            visualize(data)
            sys.exit(1)
        train_set, test_set = split(data)
        num_examples = train_set.shape[0]
        num_features = train_set.shape[1] - 1
        if args.mini_batch:
            batch_size = 32  # or 64
            epochs = 1500
        else:
            batch_size = num_examples
            epochs = 30000
        nn = NeuralNetwork(num_features, batch_size, epochs)
        if args.train:
            nn.train(data, train_set, test_set, num_examples, args.quiet)
            if args.evaluation:
                y_pred = probability_to_class(nn.output.T)
                get_validation_metrics(y_pred[:, 0], nn.y.T[:, 0])
            # mini-batch learning is noisy, so we don't plot it
            if not args.mini_batch:
                plot_learning(nn.train_losses, nn.test_losses)
            # save network params
            if args.save_model:
                W1, W2, W3, W4 = (nn.weights1.tolist(), nn.weights2.tolist(),
                                  nn.weights3.tolist(), nn.weights4.tolist())
                B1, B2, B3, B4 = (nn.bias1.tolist(), nn.bias2.tolist(),
                                  nn.bias3.tolist(), nn.bias4.tolist())
                model = dict(weights1=W1, weights2=W2, weights3=W3, weights4=W4,
                             bias1=B1, bias2=B2, bias3=B3, bias4=B4)
                with open('model.json', 'w') as f:
                    json.dump(model, f, separators=(',', ':'), indent=4)
        if args.predict == 'model.json':
            try:
                with open(args.predict) as file:
                    model = json.load(file)
            except Exception:
                error_exit('please provide a valid model')
            nn.load_model(model)
            nn.predict(test_set, epochs)
    except Exception:
        # catch Exception, not a bare except: a bare except would also swallow
        # the SystemExit raised by sys.exit() above
        pass
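# probability_to_class() is referenced above but defined elsewhere. A minimal
# sketch, assuming it thresholds the network's sigmoid outputs at 0.5 to
# produce hard 0/1 labels with the same shape as the input:
import numpy as np

def probability_to_class(probabilities):
    return (np.asarray(probabilities) > 0.5).astype(int)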
def predict_house(data, weights):
    try:
        houses = ['Gryffindor', 'Ravenclaw', 'Slytherin', 'Hufflepuff']
        _, X = tools.generic_preprocess(data, 'mean')
        weights = weights.drop(weights.columns[0], axis=1)
        students = data.loc[:, 'Hogwarts House'].to_frame()
        # one-vs-all: score every student against each house's weights,
        # then pick the house with the highest probability
        for i, house in enumerate(houses):
            theta = np.array(weights.iloc[i:i + 1]).reshape(X.shape[1], 1)
            students[house] = logreg.predict(X, theta)
        students = students.drop(columns=['Hogwarts House'])
        predictions = students.idxmax(axis=1)
    except Exception:
        tools.error_exit('Failed to predict houses.')
    return predictions
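# logreg.predict() comes from the project's logistic-regression module, which
# is not part of this excerpt. A minimal sketch, assuming it evaluates the
# logistic hypothesis, i.e. the sigmoid of X @ theta:
import numpy as np

def predict(X, theta):
    return 1.0 / (1.0 + np.exp(-X.dot(theta)))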
def parse_args(usage):
    my_parser = argparse.ArgumentParser(description=usage)
    my_parser.add_argument('Truth', metavar='true answers', type=str,
                           help='the path to the true answers')
    my_parser.add_argument('Predicted', metavar='predicted answers', type=str,
                           help='the path to the predicted answers')
    args = my_parser.parse_args()
    true_path = args.Truth
    predicted_path = args.Predicted
    true = tools.read_csv(true_path)
    predicted = tools.read_csv(predicted_path)
    try:
        true = true['Hogwarts House']
        predicted = predicted['Hogwarts House']
    except Exception:
        tools.error_exit('Failed to find house in data. Is data valid?')
    return true, predicted
def train(normed, X, cost):
    try:
        alpha = 0.02
        num_iters = 100000
        weights = {}
        houses = ['Gryffindor', 'Ravenclaw', 'Slytherin', 'Hufflepuff']
        if cost:
            ax = tools.plot_set_up()
        for house in houses:
            # one-vs-all: label the current house 1 and every other house 0
            y = iterate_houses(normed, house)
            theta = np.zeros(X.shape[1]).reshape(X.shape[1], 1)
            theta, J_history = logreg.fit(X, y, theta, alpha, num_iters)
            if cost:
                tools.plot_house(J_history, house, ax)
            # flatten the (n, 1) theta column into a plain list for the CSV
            weights[house] = [item for array in theta for item in array]
        if cost:
            tools.plot_show()
    except Exception:
        tools.error_exit('Failed to train weights.')
    return weights
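# iterate_houses() and logreg.fit() are defined elsewhere. Minimal sketches,
# assuming the normalised frame still carries the 'Hogwarts House' labels and
# that fit() runs batch gradient descent on the logistic cost, returning the
# learned theta and the per-iteration cost history:
import numpy as np

def iterate_houses(normed, house):
    # 1 where the student belongs to `house`, 0 otherwise
    return (normed['Hogwarts House'] == house).astype(int).to_numpy().reshape(-1, 1)

def fit(X, y, theta, alpha, num_iters):
    m = len(y)
    J_history = []
    for _ in range(num_iters):
        h = 1.0 / (1.0 + np.exp(-X.dot(theta)))   # sigmoid hypothesis
        theta -= (alpha / m) * X.T.dot(h - y)     # gradient step
        eps = 1e-15                               # guard against log(0)
        cost = -(y * np.log(h + eps) + (1 - y) * np.log(1 - h + eps)).mean()
        J_history.append(cost)
    return theta, J_history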
def find_features(data):
    try:
        features = pd.DataFrame({
            '': ['Count', 'Mean ', 'Std ', 'Min ', '25% ', '50% ', '75% ', 'Max ']
        })
        col = 0
        for column in data.columns:
            # the first six columns are non-numeric student details; skip them
            if col > 5:
                count = 0
                total = 0
                feature = np.array(data[column], dtype='float64')
                feature = np.sort(feature)  # NaNs sort to the end
                min_set = 0
                for value in feature:
                    if str(value) != 'nan':
                        count += 1
                        total += value
                        if min_set == 0:
                            minimum = value
                            maximum = value
                            min_set = 1
                        else:
                            maximum = value
                mean = total / count
                std = std_dev(feature, mean, count)
                quarter = find_quart(feature, count, 1)
                half = find_quart(feature, count, 2)
                three_quarter = find_quart(feature, count, 3)
                features[column] = np.array([count, mean, std, minimum, quarter,
                                             half, three_quarter, maximum])
            col += 1
    except Exception:
        tools.error_exit('Failed to read file')
    return features
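# std_dev() and find_quart() are helpers defined elsewhere. Minimal sketches,
# assuming the sample standard deviation (n - 1 denominator, matching pandas
# describe()) and a simple nearest-rank quartile over the sorted, NaN-trimmed
# values; the real helpers may interpolate instead:
import math

def std_dev(feature, mean, count):
    # feature is sorted with NaNs last, so the first `count` values are valid
    total = sum((value - mean) ** 2 for value in feature[:count])
    return math.sqrt(total / (count - 1))

def find_quart(feature, count, quart):
    # quart = 1, 2, or 3 for the 25th, 50th, or 75th percentile
    return feature[int(count * quart / 4)]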
def visualize(data):
    try:
        sb.pairplot(data, hue='Hogwarts House',
                    palette=['blue', 'green', 'red', 'gold'],
                    markers='.', height=2)
        plt.show()
    except Exception:
        tools.error_exit('Failed to visualize data. Is data valid?')