def visualize(data):
    """Plot 'Defense Against the Dark Arts' against 'Astronomy' for *data*.

    The drawing itself is delegated to ``scatter`` (defined elsewhere in
    this file).  Any exception raised while plotting is reported through
    ``tools.error_exit`` instead of propagating to the caller.
    """
    x_course, y_course = 'Defense Against the Dark Arts', 'Astronomy'
    try:
        scatter(data, x_course, y_course)
    except Exception:
        tools.error_exit('Failed to visualize data. Is data valid?')
示例#2
0
def preprocess(args):
    """Load the CSV named by ``args.Dataset`` and prepare it for training.

    Steps performed, in order:
      1. read the file with ``pd.read_csv``;
      2. overwrite the column labels with the 32 fixed feature names below;
      3. drop the ``id`` column and every row containing a missing value;
      4. encode ``diagnosis`` as 1 for 'M' and 0 for 'B';
      5. pass the frame through ``scale`` (defined elsewhere) and return
         whatever it returns.

    Any exception raised in steps 1-4 is reported through ``error_exit``.
    """
    # NOTE(review): read_csv treats the first row as a header by default.
    # If the dataset file has no header row, the first sample is consumed
    # as column labels and then lost when the labels are overwritten --
    # confirm against the actual data file.
    column_names = [
        'id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
        'area_mean', 'smoothness_mean', 'compactness_mean',
        'concavity_mean', 'concave points_mean', 'symmetry_mean',
        'fractal_dimension_mean', 'radius_se', 'texture_se',
        'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se',
        'concavity_se', 'concave points_se', 'symmetry_se',
        'fractal_dimension_se', 'radius_worst', 'texture_worst',
        'perimeter_worst', 'area_worst', 'smoothness_worst',
        'compactness_worst', 'concavity_worst', 'concave points_worst',
        'symmetry_worst', 'fractal_dimension_worst'
    ]
    label_encoding = {'M': 1, 'B': 0}

    try:
        frame = pd.read_csv(args.Dataset)
        frame.columns = column_names
        frame = frame.drop(columns=['id'])
        frame = frame.dropna()
        frame['diagnosis'] = frame['diagnosis'].map(label_encoding)
    except Exception:
        error_exit('Failed to preprocess data. Is data valid?')

    return scale(frame)
示例#3
0
def write_weights(weights, courses):
    """Save the trained weights to ``../data/weights.csv``.

    ``weights`` is turned into a DataFrame with one row per dictionary key
    (``orient='index'``) and ``courses`` as the column labels, then written
    as CSV.  A confirmation line is printed on success; any exception is
    reported through ``tools.error_exit``.
    """
    destination = '../data/weights.csv'
    try:
        table = pd.DataFrame.from_dict(weights,
                                       orient='index',
                                       columns=courses)
        table.to_csv(destination)
        print('Successfully saved weights to ../data/weights.csv')
    except Exception:
        tools.error_exit('Saving weights.csv failed')
示例#4
0
def write_houses(predictions):
    """Save the predicted houses to ``../data/houses.csv``.

    ``predictions`` must provide a pandas-style ``to_csv`` method.  The
    index column is labelled 'Index' and the value column 'Hogwarts House'.
    A confirmation line is printed on success; any exception is reported
    through ``tools.error_exit``.
    """
    destination = '../data/houses.csv'
    csv_options = {'index_label': 'Index', 'header': ['Hogwarts House']}
    try:
        predictions.to_csv(destination, **csv_options)
        print('Successfully saved predicted houses to ../data/houses.csv')
    except Exception:
        tools.error_exit('Saving houses.csv failed')
示例#5
0
def preprocess(data):
    """Preprocess *data* and derive the list of weight column names.

    Delegates to ``tools.generic_preprocess(data, 'drop')``, which yields
    the pair ``(normed, X)``.  The returned ``courses`` list is the column
    labels of ``normed`` with the first label replaced by 'intercept'.

    Returns:
        The tuple ``(normed, courses, X)``.

    Any exception is reported through ``tools.error_exit``.
    """
    try:
        normed, X = tools.generic_preprocess(data, 'drop')
        # Unpacking fails on an empty label array, just as indexing
        # position 0 would, so the error path is unchanged.
        _first_label, *other_labels = normed.columns.values
        courses = ['intercept', *other_labels]
    except Exception:
        tools.error_exit('Failed to preprocess data. Is data valid?')
    return normed, courses, X
示例#6
0
def check_names(data_path, weights_path):
    """Sanity-check the two command-line paths by substring.

    Args:
        data_path: path expected to contain the substring 'dataset'.
        weights_path: path expected to contain the substring 'weights'.

    Each path that lacks its keyword is reported through
    ``tools.error_exit``.  Returns ``None`` when both paths pass.
    """
    # `not in` replaces the non-idiomatic `(x in y) == False`; the `in`
    # operator always yields a bool, so the two forms are equivalent.
    if 'dataset' not in data_path:
        tools.error_exit(
            'Data path specified ({}) does not include dataset'.format(
                data_path))
    if 'weights' not in weights_path:
        tools.error_exit(
            'Weights path specified ({}) does not include weights'.format(
                weights_path))
示例#7
0
def preprocess(data, plot_all):
    """Strip non-feature columns and incomplete rows from *data*.

    Args:
        data: DataFrame holding the student records.
        plot_all: when equal to ``False``, three additional course columns
            are removed as well.

    Returns:
        A new DataFrame without the identity columns (and, optionally, the
        three extra courses) and without any row containing a missing
        value.

    Any exception is reported through ``tools.error_exit``.
    """
    identity_columns = [
        'Index', 'First Name', 'Last Name', 'Birthday', 'Best Hand'
    ]
    skipped_courses = [
        'Arithmancy', 'Care of Magical Creatures',
        'Defense Against the Dark Arts'
    ]
    try:
        data = data.drop(columns=identity_columns)
        # Deliberately an equality test: `not plot_all` would also fire
        # for None, which the original comparison does not.
        if plot_all == False:
            data = data.drop(columns=skipped_courses)
        data = data.dropna()
    except Exception:
        tools.error_exit('Failed to preprocess data. Is data valid?')
    return data
示例#8
0
def find_accuracy(true, predicted):
    """Return the prediction accuracy as a percentage rounded to 2 places.

    Args:
        true: sequence of reference labels.
        predicted: sequence of predicted labels; must have the same length
            as *true*, otherwise the mismatch is reported through
            ``tools.error_exit``.

    The score itself comes from ``accuracy_score`` (imported elsewhere in
    this file -- presumably scikit-learn's; confirm) and is multiplied by
    100.  Any exception is reported through ``tools.error_exit``.
    """
    try:
        if len(true) != len(predicted):
            tools.error_exit('Number of true and predicted houses different')
        percent = accuracy_score(true, predicted) * 100
    except Exception:
        tools.error_exit('Failed to find accuracy, are you sure predictions are valid?')
    return round(percent, 2)
def pair_plot(feature):
    """Show a seaborn pair plot of *feature*, coloured by 'diagnosis'.

    The two hue levels are drawn with circle and square markers using the
    "husl" palette, each facet being 4 units high.  Any exception raised
    while plotting is reported through ``tools.error_exit``.
    """
    plot_options = {
        'hue': 'diagnosis',
        'palette': "husl",
        'markers': ["o", "s"],
        'height': 4,
    }
    try:
        sns.pairplot(feature, **plot_options)
        plt.show()
    except Exception:
        tools.error_exit('Failed to visualize data. Is data valid?')
示例#10
0
def main():
    """Program entry point: preprocess, then visualize, train and/or predict.

    Flow, driven by the parsed command-line ``args``:
      * ``args.visualize_data`` -- show the data visualization and return.
      * ``args.train`` -- train the network; optionally print validation
        metrics (``args.evaluation``), plot the learning curves (full-batch
        mode only) and dump the parameters to ``model.json``
        (``args.save_model``).
      * ``args.predict`` -- only honoured when it equals "model.json":
        load that file and run prediction on the test split.

    Error handling: the function used to end with a bare ``except: pass``,
    which silently discarded every failure -- including the ``SystemExit``
    raised by ``sys.exit`` and (presumably) by ``error_exit``.  Now
    ``SystemExit`` propagates, unexpected exceptions are reported through
    ``error_exit``, and only Ctrl-C is still ignored quietly.
    """
    try:
        args = get_args()

        data = preprocess(args)

        if args.visualize_data:
            visualize(data)
            # Visualization is a standalone mode.  The old `sys.exit(1)`
            # was swallowed by the bare except, so it effectively returned
            # normally; a plain return keeps that and no longer signals
            # failure after a successful plot.
            return

        train_set, test_set = split(data)

        num_examples = train_set.shape[0]
        num_features = train_set.shape[1] - 1  # one column is not a feature
        if args.mini_batch:
            batch_size = 32  # or 64
            epochs = 1500
        else:
            # Full-batch learning: every example in each update.
            batch_size = num_examples
            epochs = 30000

        nn = NeuralNetwork(num_features, batch_size, epochs)

        if args.train:
            nn.train(data, train_set, test_set, num_examples, args.quiet)

            if args.evaluation:
                y_pred = probability_to_class(nn.output.T)
                get_validation_metrics(y_pred[:, 0], nn.y.T[:, 0])

            # mini-batch learning is noisy, so we don't plot it
            if not args.mini_batch:
                plot_learning(nn.train_losses, nn.test_losses)

            # save network params
            if args.save_model:
                model = {
                    'weights1': nn.weights1.tolist(),
                    'weights2': nn.weights2.tolist(),
                    'weights3': nn.weights3.tolist(),
                    'weights4': nn.weights4.tolist(),
                    'bias1': nn.bias1.tolist(),
                    'bias2': nn.bias2.tolist(),
                    'bias3': nn.bias3.tolist(),
                    'bias4': nn.bias4.tolist(),
                }
                with open("model.json", "w") as f:
                    json.dump(model, f, separators=(',', ':'), indent=4)

        # NOTE(review): prediction only runs when the supplied path is
        # literally "model.json"; any other value is silently ignored.
        if args.predict and (args.predict == "model.json"):
            try:
                with open(args.predict) as file:
                    model = json.load(file)
            except (OSError, ValueError):
                # OSError: file missing/unreadable.  ValueError: covers
                # json.JSONDecodeError and text decoding errors.
                error_exit("please provide a valid model")
            nn.load_model(model)
            nn.predict(test_set, epochs)

    except KeyboardInterrupt:
        # Ctrl-C (e.g. during a long training run) ends quietly, as before.
        pass
    except Exception as err:
        # SystemExit is not an Exception subclass, so deliberate exits pass
        # through untouched; anything else is reported instead of hidden.
        error_exit('Unexpected error: {}'.format(err))
示例#11
0
def predict_house(data, weights):
    """Predict a house for every row of *data* from one-vs-all weights.

    Args:
        data: DataFrame with a 'Hogwarts House' column; it is preprocessed
            with ``tools.generic_preprocess(data, 'mean')`` to obtain the
            feature matrix.
        weights: DataFrame whose first column is discarded and whose rows
            are taken, in order, as the weight vectors for Gryffindor,
            Ravenclaw, Slytherin and Hufflepuff.

    Returns:
        For each row, the label of the house column holding the largest
        ``logreg.predict`` output (``idxmax`` across the four columns).

    Any exception is reported through ``tools.error_exit``.
    """
    try:
        houses = ['Gryffindor', 'Ravenclaw', 'Slytherin', 'Hufflepuff']
        _, X = tools.generic_preprocess(data, 'mean')
        weights = weights.drop(weights.columns[0], axis=1)
        students = data.loc[:, 'Hogwarts House'].to_frame()

        for row, house in enumerate(houses):
            # Row `row` of the weight table, reshaped to a column vector.
            theta = np.array(weights.iloc[row:row + 1]).reshape(X.shape[1], 1)
            students[house] = logreg.predict(X, theta)

        scores = students.drop(columns=['Hogwarts House'])
        predictions = scores.idxmax(axis=1)
    except Exception:
        tools.error_exit('Failed to predict houses.')
    return predictions
示例#12
0
def parse_args(usage):
    """Parse the two CSV paths from the command line and load the houses.

    Args:
        usage: description text shown in the argparse help output.

    Two positional arguments are expected: the path to the true answers
    and the path to the predicted answers.  Both files are loaded with
    ``tools.read_csv``.

    Returns:
        The tuple ``(true, predicted)`` holding the 'Hogwarts House' column
        of each file.  A failure to extract that column is reported through
        ``tools.error_exit``.
    """
    positionals = (
        ('Truth', 'true answers', 'the path to the true answers'),
        ('Predicted', 'predicted answers',
         'the path to the predicted answers'),
    )
    my_parser = argparse.ArgumentParser(description=usage)
    for dest, metavar, help_text in positionals:
        my_parser.add_argument(dest, metavar=metavar, type=str,
                               help=help_text)
    args = my_parser.parse_args()

    true = tools.read_csv(args.Truth)
    predicted = tools.read_csv(args.Predicted)

    house_column = 'Hogwarts House'
    try:
        true = true[house_column]
        predicted = predicted[house_column]
    except Exception:
        tools.error_exit('Failed to find house in data. Is data valid?')
    return true, predicted
示例#13
0
def train(normed, X, cost):
    """Fit one weight vector per house (one-vs-all) and return them.

    Args:
        normed: preprocessed data passed to ``iterate_houses`` to build the
            target vector for each house.
        X: feature matrix; ``X.shape[1]`` fixes the length of each weight
            vector.
        cost: when equal to ``True``, the cost history of every house is
            plotted through the ``tools`` plotting helpers.

    Returns:
        A dict mapping each house name to its fitted weights, flattened to
        a plain list.

    Each fit starts from a zero vector and calls ``logreg.fit`` with a
    learning rate of 0.02 for 100000 iterations.  Any exception is reported
    through ``tools.error_exit``.
    """
    try:
        learning_rate = 0.02
        iterations = 100000
        houses = ['Gryffindor', 'Ravenclaw', 'Slytherin', 'Hufflepuff']
        weights = {}
        # Deliberately an equality test, evaluated once: plain truthiness
        # would also accept values such as 2 or a non-empty string.
        show_cost = cost == True
        if show_cost:
            ax = tools.plot_set_up()
        for house in houses:
            y = iterate_houses(normed, house)
            initial_theta = np.zeros((X.shape[1], 1))
            theta, J_history = logreg.fit(X, y, initial_theta,
                                          learning_rate, iterations)
            if show_cost:
                tools.plot_house(J_history, house, ax)
            # Flatten the 2-D column vector into a 1-D list.
            weights[house] = [value for row in theta for value in row]
        if show_cost:
            tools.plot_show()
    except Exception:
        tools.error_exit('Failed to train weights.')
    return weights
示例#14
0
def find_features(data):
    """Build a describe-style summary table for the columns of *data*.

    The first six columns (positions 0-5) are skipped.  Every later column
    is converted to float64 and summarised by eight statistics, in this
    row order: count, mean, standard deviation, minimum, 25%, 50%, 75% and
    maximum.  NaN entries are ignored for the count, mean, minimum and
    maximum; the standard deviation and quartiles come from ``std_dev`` and
    ``find_quart`` (defined elsewhere in this file).

    Returns:
        A DataFrame whose first column (named '') holds the row labels and
        which has one further column per summarised feature.

    Any exception is reported through ``tools.error_exit``.
    """
    row_labels = [
        'Count', 'Mean ', 'Std  ', 'Min  ', '25%  ', '50%  ', '75%  ',
        'Max  '
    ]
    try:
        features = pd.DataFrame({'': row_labels})
        for position, column in enumerate(data.columns):
            if position <= 5:
                continue

            # np.sort places NaN last, so the first valid value seen is
            # the minimum and the last valid value seen is the maximum.
            feature = np.sort(np.array(data[column], dtype='float64'))
            count = 0
            total = 0
            for value in feature:
                if np.isnan(value):
                    continue
                if count == 0:
                    minimum = value
                maximum = value
                count += 1
                total += value

            mean = total / count
            std = std_dev(feature, mean, count)
            quarter, half, three_quarter = (
                find_quart(feature, count, part) for part in (1, 2, 3))

            features[column] = np.array([
                count, mean, std, minimum, quarter, half, three_quarter,
                maximum
            ])
    except Exception:
        tools.error_exit('Failed to read file')
    return features
示例#15
0
def visualize(data):
    """Show a seaborn pair plot of *data*, coloured by 'Hogwarts House'.

    The hue levels are drawn in blue, green, red and gold with point
    markers, each facet being 2 units high.  Any exception raised while
    plotting is reported through ``tools.error_exit``.
    """
    house_colours = ['blue', 'green', 'red', 'gold']
    try:
        sb.pairplot(data,
                    hue='Hogwarts House',
                    palette=house_colours,
                    markers='.',
                    height=2)
        plt.show()
    except Exception:
        tools.error_exit('Failed to visualize data. Is data valid?')