def main(argv):
    input_filename_x = 'train_data.csv'
    input_filename_y = 'train_labels.csv'
    test_input_filename = 'test_data.csv'
    svc_model_filename = 'svc_classif.pkl'
    lr_model_filename = 'lr_classif.pkl'
    rfc_model_filename = 'rfc_classif.pkl'
    rfc_feat_imp_filename = 'rfc_feat_imp.png'
    model_comp_result_chart_filename = 'method_comp_res.png'

    io = lib.io.IO()
    viz = lib.viz.Viz()

    # Read data
    X_o, y_o = io.read_data(input_filename_x, input_filename_y)
    test_x = io.read_data(test_input_filename, None)

    print("There are " + str(len(X_o)) + " samples in the train set.")
    print("There are " + str(len(test_x)) + " samples in the test set.")

    SVC_ll, SVC_a, RFC_ll, RFC_a, LR_ll, LR_a = [], [], [], [], [], []

    # Cross-validate the logistic regression model over a range of PCA
    # component counts
    comps = [10, 20, 50, 100, 150, 200, 264]
    for s in comps:
        print("Amount of components: %d" % s)
        cl = lib.cl.CL(io, viz)
        X = copy.deepcopy(X_o)
        y = copy.deepcopy(y_o)
        test_x = np.matrix(test_x)
        test_ids = range(1, len(test_x) + 1)

        # Remove outliers
        X, y = cl.lof(np.matrix(X), np.matrix(y))

        # Shuffle
        X, y = io.shuffle(X, y)

        # PCA
        X = cl.pca(np.matrix(X), components=s, filename=None).tolist()
        # test_x = cl.pca(np.matrix(test_x), components=s, filename=None).tolist()

        # Split data to train and validation set
        val_ids, val_x, val_y = io.pick_set(X, y, 726)
        train_ids, train_x, train_y = io.pick_set(X, y, 3200)

        # Train
        cl.lr_cl_train(train_x, train_y, filename=lr_model_filename)

        # Validate
        ll, a = cl.lr_cl_val(val_x, val_y)
        LR_ll.append(ll)
        LR_a.append(a)

    # Draw some results
    viz.cross_results(comps, LR_ll, LR_a, 'pca_cross_val.png')
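# Note on the commented-out test-set PCA above: lib.cl.CL.pca is not shown in
# this file, and if it fits a new projection on each call, transforming test_x
# with a separate call would project it into a different space than the
# training data. A minimal sketch of the usual fit-on-train / transform-test
# pattern, assuming a scikit-learn style PCA (an assumption, not the repo's
# confirmed implementation):
def pca_train_test_sketch(train_x, test_x, components):
    import numpy as np
    from sklearn.decomposition import PCA

    pca = PCA(n_components=components)
    # Fit the projection on the training data only
    train_reduced = pca.fit_transform(np.asarray(train_x))
    # Reuse the same fitted projection for the test data
    test_reduced = pca.transform(np.asarray(test_x))
    return train_reduced, test_reduced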
def main(argv):
    input_filename_x = 'train_data.csv'
    input_filename_y = 'train_labels.csv'
    test_input_filename = 'test_data.csv'
    lr_model_filename = 'lr2_classif.pkl'
    model_comp_result_chart_filename = 'method_comp_res.png'

    io = lib.io.IO()
    viz = lib.viz.Viz()
    cl = lib.cl.CL(io, viz)

    # Read data
    X, y = io.read_data(input_filename_x, input_filename_y)
    test_x = io.read_data(test_input_filename, None)
    X_ = copy.deepcopy(X)
    y_ = copy.deepcopy(y)

    print("There are " + str(len(X)) + " samples in the train set.")
    print("There are " + str(len(test_x)) + " samples in the test set.")

    test_x = np.matrix(test_x)
    test_ids = range(1, len(test_x) + 1)

    # Remove outliers
    X, y = cl.lof(np.matrix(X), np.matrix(y))

    # Shuffle
    X, y = io.shuffle(X, y)

    # PCA
    # X = cl.pca(np.matrix(X), 'pca_explained_variance.png').tolist()
    # test_x = cl.pca(np.matrix(test_x), None).tolist()

    # Split data to train and validation set
    val_ids, val_x, val_y = io.pick_set(X, y, 726)
    train_ids, train_x, train_y = io.pick_set(X, y, 3200)

    # Train
    # cl.lr_cl_train(train_x, train_y, filename=lr_model_filename)
    cl.lr_cl_load(lr_model_filename)

    # Validate
    cl.lr_cl_val(val_x, val_y)

    # Predict
    pred_class, pred_proba = cl.lr_cl_pred(test_x)

    # Output
    io.write_classes('classes_lr2_result.csv', test_ids, pred_class)
    io.write_probabilities('probabilities_lr2_result.csv', test_ids, pred_proba)
def run(data_path, grid_search_path, ensemble_output_path, score_output_path,
        number_of_partitions, number_of_iterations, best_proportion,
        used_proportion):
    # Read partitioned input data
    data = read_partitioned_data(data_path, number_of_iterations,
                                 number_of_partitions)

    # Read true values from the partitioned data set
    true_values = get_true_values(data)

    # Read the grid search results as input data
    results = read_data(grid_search_path)

    # Construct the ensemble based on the results of the grid search and the
    # proportion parameters passed to this script
    ensemble = construct_ensemble(results, best_proportion, used_proportion)

    # Retrieve the classification results from the ensemble based on a
    # popularity vote
    predicted_values = ensemble_vote(ensemble)

    # Score the classification results of the ensemble against the true values
    result = Result()
    result.add_values(true_values, predicted_values)
    result.calculate()

    # Output the ensemble into the specified file
    write_data(ensemble_output_path, ensemble)

    # Output the ensemble score into the specified file
    write_data(score_output_path, result)
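# ensemble_vote is defined elsewhere in the repo. A minimal sketch of what a
# popularity (majority) vote over the ensemble members could look like,
# assuming each member exposes a per-sample list of predicted labels in a
# `predicted_values` attribute (the attribute name is an assumption):
def ensemble_vote_sketch(ensemble):
    from collections import Counter

    # One row of predictions per ensemble member
    all_predictions = [member.predicted_values for member in ensemble]
    voted = []
    # For every sample, pick the label most members agreed on
    for sample_predictions in zip(*all_predictions):
        voted.append(Counter(sample_predictions).most_common(1)[0][0])
    return voted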
def main(argv):
    input_filename_x = 'train_data.csv'
    input_filename_y = 'train_labels.csv'
    test_input_filename = 'test_data.csv'

    io = lib.io.IO()
    viz = lib.viz.Viz()
    cl = lib.cl.CL(io, viz)

    # Read data
    X, y = io.read_data(input_filename_x, input_filename_y)
    test_x = io.read_data(test_input_filename, None)

    print("There are " + str(len(X)) + " samples in the train set.")
    print("There are " + str(len(test_x)) + " samples in the test set.")

    # Plot the label distribution of the train set
    viz.label_hist(y, 'label_hist.png')
def run(input_path, output_path):
    # Read the results as input data
    results = read_data(input_path)

    # Retrieve the result with the highest average F1-score
    best_result = sorted(results, key=lambda k: k.average_f1(), reverse=True)[0]

    # Output the score into the specified file
    write_data(output_path, best_result)
def run(input_path, output_path):
    # Read the integrated score results
    scores = read_data(input_path)
    print(scores)  # TODO: Remove

    # Visualise the scores as a heatmap
    pass  # TODO

    # Save the heatmap to the specified destination
    pass  # TODO
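# The two TODOs above are not implemented in this file. A minimal sketch of
# the missing heatmap step, assuming `scores` is the nested
# {time_window: {frequency_band: score}} dictionary produced by the score
# integration script, and assuming matplotlib (the helper name below is
# hypothetical):
def plot_score_heatmap_sketch(scores, output_path):
    import numpy as np
    import matplotlib.pyplot as plt

    time_windows = sorted(scores.keys())
    frequency_bands = sorted(scores[time_windows[0]].keys())
    # One row per time window, one column per frequency band
    matrix = np.array([[scores[tw][fb] for fb in frequency_bands]
                       for tw in time_windows], dtype=float)

    fig, ax = plt.subplots()
    im = ax.imshow(matrix, aspect='auto')
    ax.set_xticks(np.arange(len(frequency_bands)))
    ax.set_xticklabels(frequency_bands)
    ax.set_yticks(np.arange(len(time_windows)))
    ax.set_yticklabels(time_windows)
    ax.set_xlabel('Frequency band')
    ax.set_ylabel('Time window')
    fig.colorbar(im, ax=ax)
    fig.savefig(output_path)
    plt.close(fig)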
def main(argv):
    input_filename_x = 'train_data.csv'
    input_filename_y = 'train_labels.csv'
    test_input_filename = 'test_data.csv'
    lr_model_filename = 'lr2_classif.pkl'
    model_comp_result_chart_filename = 'method_comp_res.png'

    io = lib.io.IO()
    viz = lib.viz.Viz()
    cl = lib.cl.CL(io, viz)

    # Read data
    X, y = io.read_data(input_filename_x, input_filename_y)
    test_x = io.read_data(test_input_filename, None)
    X_ = copy.deepcopy(X)
    y_ = copy.deepcopy(y)

    print("There are " + str(len(X)) + " samples in the train set.")
    print("There are " + str(len(test_x)) + " samples in the test set.")

    test_x = np.matrix(test_x)
    test_ids = range(1, len(test_x) + 1)

    # Load a previously trained logistic regression model
    cl.lr_cl_load(lr_model_filename)

    # Predict on the train set and plot the confusion matrices
    pred_class, pred_proba = cl.lr_cl_pred(X)
    viz.plot_confusion_matrix(y, pred_class, np.arange(1, 11))
    viz.plot_confusion_matrix(y, pred_class, np.arange(1, 11),
                              normalize=True,
                              filename='confusion_matrix_norm.png')
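# viz.plot_confusion_matrix is a project wrapper that is not shown here. A
# minimal sketch of an equivalent, assuming scikit-learn and matplotlib (the
# name and signature below are illustrative, not the wrapper's confirmed
# interface):
def plot_confusion_matrix_sketch(y_true, y_pred, class_labels,
                                 normalize=False,
                                 filename='confusion_matrix.png'):
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.metrics import confusion_matrix

    cm = confusion_matrix(y_true, y_pred, labels=class_labels)
    if normalize:
        # Normalize each row to sum to one (per-class recall view)
        cm = cm.astype(float) / cm.sum(axis=1, keepdims=True)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest')
    ax.set_xticks(np.arange(len(class_labels)))
    ax.set_xticklabels(class_labels)
    ax.set_yticks(np.arange(len(class_labels)))
    ax.set_yticklabels(class_labels)
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
    fig.colorbar(im, ax=ax)
    fig.savefig(filename)
    plt.close(fig)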
def run(input_path):
    # Read the data set
    data = read_data(input_path)

    # Print header
    print('--------------------------------------------------')
    print('Descriptive analysis')
    print('--------------------------------------------------')

    # Analyse keys
    keys = sorted(data.keys())
    print('There are ' + str(len(keys)) + ' keys in total:')
    print(' ' + str(keys))
    print('')

    # Analyse subjects
    print('There are ' + str(len(set(data['subjects']))) + ' ' +
          'unique subjects on whom tests were performed.')
    print('Their frequencies within the data set are the following:')
    output_frequencies(data['subjects'])
    print('')

    # Analyse Brodmann areas
    print('There are ' + str(len(set(data['areas']))) + ' ' +
          'unique Brodmann areas used in the tests.')
    print('Their frequencies within the data set are the following:')
    output_frequencies(data['areas'])
    print('')

    # Analyse image categories
    print('There are ' + str(len(set(data['image_category']))) + ' unique ' +
          'image categories that the images have been classified into.')
    print('Their frequencies within the data set are the following:')
    output_frequencies(data['image_category'])
    print('')

    # Analyse tests
    print('In total, there were ' + str(len(data['subjects'])) + ' tests ' +
          'performed on each of the ' + str(len(data['image_category'])) + ' ' +
          'images. For each of these test and image pairs, there is an ' +
          'integer denoting the neural response in the specified Brodmann ' +
          'area of the specified patient after showing them the specified ' +
          'image.')
    print('--------------------------------------------------')
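# output_frequencies is defined elsewhere in the repo. A minimal sketch of
# what it presumably does, assuming it prints each unique value alongside its
# occurrence count (an assumption based on how it is used above):
def output_frequencies_sketch(values):
    from collections import Counter

    for value, count in sorted(Counter(values).items()):
        print(' ' + str(value) + ': ' + str(count))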
def run(raw_input_path, output_path_recall, output_path_precision,
        output_path_f1, time_windows, frequency_bands):
    # Convert time windows to integers
    time_windows = [int(time_window) for time_window in time_windows]

    # Initialise the integrated score data dictionaries
    integrated_recall = {}
    integrated_precision = {}
    integrated_f1 = {}
    for time_window in time_windows:
        integrated_recall[time_window] = {}
        integrated_precision[time_window] = {}
        integrated_f1[time_window] = {}
        for frequency_band in frequency_bands:
            integrated_recall[time_window][frequency_band] = None
            integrated_precision[time_window][frequency_band] = None
            integrated_f1[time_window][frequency_band] = None

    # Read the recall, precision, and F1-scores from the input files into the
    # integrated data dictionaries, iterating through each time window and
    # frequency band pair
    for time_window in time_windows:
        for frequency_band in frequency_bands:
            # Construct the input file path
            input_path = raw_input_path\
                .replace('TIMEWINDOW', str(time_window))\
                .replace('FREQUENCYBAND', frequency_band)

            # Read the input file
            input_data = read_data(input_path)

            # Add the scores received from the data into the integrated data
            # dictionaries
            integrated_recall[time_window][frequency_band] = \
                input_data.average_recall()
            integrated_precision[time_window][frequency_band] = \
                input_data.average_precision()
            integrated_f1[time_window][frequency_band] = \
                input_data.average_f1()

    # Output the integrated scores into the specified files
    write_data(output_path_recall, integrated_recall)
    write_data(output_path_precision, integrated_precision)
    write_data(output_path_f1, integrated_f1)
def main(argv):
    X, y = [], []
    io = lib.io.IO()

    # Read data
    X, y = io.read_data()
    X, y = numpy.matrix(X), numpy.matrix(y).T
    print(X.shape)
    print(y.shape)
    sys.exit()

    # Dead code below (unreachable after sys.exit()): exploratory histogram
    # of the errors
    n, bins, patches = plt.hist(errors)
    plt.ylabel('Frequency')
    plt.xlabel('MSE')

    # Save figure
    plt.savefig('xx.png')
def run(input_path, output_path, classes):
    # Read the input data set from the specified input path
    input_data = read_data(input_path)

    # Change the list of classes to a set
    classes = set(classes)

    # Construct the output data set, filtering to only have the selected
    # classes
    output_data = {
        'subjects': input_data['subjects'],
        'areas': input_data['areas'],
        'image_category': [],
        'neural_responses': []
    }
    for i in range(len(input_data['image_category'])):
        if input_data['image_category'][i] in classes:
            for field in ['image_category', 'neural_responses']:
                output_data[field].append(input_data[field][i])

    # Write the output data set to the specified output path
    write_data(output_path, output_data)
def main(argv):
    input_filename_x = 'train_data.csv'
    input_filename_y = 'train_labels.csv'
    test_input_filename = 'test_data.csv'
    svc_model_filename = 'svc_classif.pkl'
    # lr_model_filename = 'lr_classif.pkl'
    lr_model_filename = 'classif.pkl'
    rfc_model_filename = 'rfc_classif.pkl'
    rfc_feat_imp_filename = 'rfc_feat_imp.png'
    model_comp_result_chart_filename = 'method_comp_res.png'

    io = lib.io.IO()
    viz = lib.viz.Viz()
    cl = lib.cl.CL(io, viz)

    # Read data
    X, y = io.read_data(input_filename_x, input_filename_y)
    test_x = io.read_data(test_input_filename, None)
    X_ = copy.deepcopy(X)
    y_ = copy.deepcopy(y)

    print("There are " + str(len(X)) + " samples in the train set.")
    print("There are " + str(len(test_x)) + " samples in the test set.")

    test_x = np.matrix(test_x)
    test_ids = range(1, len(test_x) + 1)

    # PCA etc.
    X = cl.pca(np.matrix(X), 'pca_explained_variance.png').tolist()
    # test_x = cl.pca(test_x, None).tolist()

    val_ids, val_x, val_y = io.pick_set(X, y, 1063)
    _, no_pca_val_x, no_pca_val_y = io.pick_set(X_, y_, 1063)
    # train_ids, train_x, train_y = io.pick_set(X, y, 4000)
    # _, no_pca_train_x, no_pca_train_y = io.pick_set(X_, y_, 4000)

    # Train (commented out: previously trained models are loaded below)
    # cl.svc_cl_train(train_x, train_y, filename=svc_model_filename)
    # cl.lr_cl_train(train_x, train_y, filename=lr_model_filename)
    # cl.rfc_cl_train(no_pca_train_x, no_pca_train_y,
    #                 filename=rfc_model_filename,
    #                 feat_imp_plot_filename=rfc_feat_imp_filename)
    cl.svc_cl_load(svc_model_filename)
    cl.lr_cl_load(lr_model_filename)
    cl.rfc_cl_load(rfc_model_filename)

    # Validate
    results = {}
    results['SVC'] = cl.svc_cl_val(val_x, val_y)
    results['Logistic Regression'] = cl.lr_cl_val(no_pca_val_x, no_pca_val_y)
    results['Random Forest Classifier'] = cl.rfc_cl_val(
        no_pca_val_x, no_pca_val_y)

    # Draw some results
    viz.model_comp_results(results, model_comp_result_chart_filename)

    # Predict
    # pred_class, pred_proba = cl.svc_cl_pred(test_x)
    pred_class, pred_proba = cl.rfc_cl_pred(test_x)
    # pred_class, pred_proba = cl.lr_cl_pred(test_x)

    # Output
    io.write_classes('classes_sub_result.csv', test_ids, pred_class)
    io.write_probabilities('probabilities_sub_result.csv', test_ids, pred_proba)
def main(argv):
    input_filename_x = 'train_data.csv'
    input_filename_y = 'train_labels.csv'
    test_input_filename = 'test_data.csv'
    svc_model_filename = 'svc_classif.pkl'
    lr_model_filename = 'lr_classif.pkl'
    rfc_model_filename = 'rfc_classif.pkl'
    rfc_feat_imp_filename = 'rfc_feat_imp.png'
    model_comp_result_chart_filename = 'method_comp_res.png'

    io = lib.io.IO()
    viz = lib.viz.Viz()

    # Read data
    X_o, y_o = io.read_data(input_filename_x, input_filename_y)
    test_x_o = io.read_data(test_input_filename, None)

    print("There are " + str(len(X_o)) + " samples in the train set.")
    print("There are " + str(len(test_x_o)) + " samples in the test set.")

    SVC_ll, SVC_a, RFC_ll, RFC_a, LR_ll, LR_a = [], [], [], [], [], []
    for s in range(50):
        print("Iteration %d" % s)
        cl = lib.cl.CL(io, viz)
        X = copy.deepcopy(X_o)
        y = copy.deepcopy(y_o)
        X_ = copy.deepcopy(X_o)
        y_ = copy.deepcopy(y_o)
        # Start from the original test data on every iteration so PCA is not
        # applied to an already reduced matrix
        test_x = np.matrix(test_x_o)
        test_ids = range(1, len(test_x) + 1)

        # Remove outliers
        X, y = cl.lof(np.matrix(X), np.matrix(y))
        X_, y_ = cl.lof(np.matrix(X_), np.matrix(y_))

        # Shuffle
        X, y = io.shuffle(X, y)
        X_, y_ = io.shuffle(X_, y_)

        # PCA
        X = cl.pca(np.matrix(X), 'pca_explained_variance.png').tolist()
        test_x = cl.pca(np.matrix(test_x), None).tolist()

        val_ids, val_x, val_y = io.pick_set(X, y, 726)
        _, no_pca_val_x, no_pca_val_y = io.pick_set(X_, y_, 726)
        train_ids, train_x, train_y = io.pick_set(X, y, 3200)
        _, no_pca_train_x, no_pca_train_y = io.pick_set(X_, y_, 3200)

        # Train
        cl.svc_cl_train(train_x, train_y, filename=svc_model_filename)
        cl.lr_cl_train(train_x, train_y, filename=lr_model_filename)
        cl.rfc_cl_train(no_pca_train_x, no_pca_train_y,
                        filename=rfc_model_filename,
                        feat_imp_plot_filename=rfc_feat_imp_filename)

        # Validate
        ll, a = cl.svc_cl_val(val_x, val_y)
        SVC_ll.append(ll)
        SVC_a.append(a)
        ll, a = cl.lr_cl_val(val_x, val_y)
        LR_ll.append(ll)
        LR_a.append(a)
        ll, a = cl.rfc_cl_val(no_pca_val_x, no_pca_val_y)
        RFC_ll.append(ll)
        RFC_a.append(a)

    # Draw some results, averaging the collected scores over all iterations
    results = {
        'SVC': (sum(SVC_ll) / len(SVC_ll), sum(SVC_a) / len(SVC_a)),
        'Logistic Regression': (sum(LR_ll) / len(LR_ll),
                                sum(LR_a) / len(LR_a)),
        'Random Forest Classifier': (sum(RFC_ll) / len(RFC_ll),
                                     sum(RFC_a) / len(RFC_a))
    }
    viz.model_comp_results(results, model_comp_result_chart_filename)

    # Predict
    pred_class, pred_proba = cl.lr_cl_pred(test_x)

    # Output
    io.write_classes('classes_sub_result.csv', test_ids, pred_class)
    io.write_probabilities('probabilities_sub_result.csv', test_ids, pred_proba)
options, args = parser.parse_args()

input_filename_x = 'train_data.csv'
input_filename_y = 'train_labels.csv'
test_input_filename = 'test_data.csv'
model_filename = 'model.yaml'
weights_filename = 'weights.h5'

io = lib.io.IO()
viz = lib.viz.Viz()
cl = lib.cl.CL(io, viz)

# Read data
print("Reading train data...")
X, y = io.read_data(input_filename_x, input_filename_y)
y = io.shift_v(y, shift=-1)
print("Reading test data...")
test_x = io.read_data(test_input_filename, None)

print("There are " + str(len(X)) + " samples in the train set.")
print("There are " + str(len(test_x)) + " samples in the test set.")

test_x = np.matrix(test_x)
test_ids = range(1, len(test_x) + 1)

# Load from file
if options.load_path is not None:
    model = load_model(options.load_path + '/' + model_filename,
                       options.load_path + '/' + weights_filename)
def main(argv):
    input_filename_x = 'train_data.csv'
    input_filename_y = 'train_labels.csv'
    test_input_filename = 'test_data.csv'
    svc_model_filename = 'svc_classif.pkl'
    lr_model_filename = 'lr_classif.pkl'
    rfc_model_filename = 'rfc_classif.pkl'
    rfc_feat_imp_filename = 'rfc_feat_imp.png'
    model_comp_result_chart_filename = 'method_comp_res.png'
    nn_model_filename = 'kf-nn1.pkl'

    io = lib.io.IO()
    viz = lib.viz.Viz()
    nn = lib.nn.NN(io, viz)

    # Read data
    print("Reading train data...")
    X, y = io.read_data(input_filename_x, input_filename_y)
    y = io.shift_v(y, shift=-1)
    print("Reading test data...")
    test_x = io.read_data(test_input_filename, None)

    print("There are " + str(len(X)) + " samples in the train set.")
    print("There are " + str(len(test_x)) + " samples in the test set.")

    test_x = np.matrix(test_x)
    test_ids = range(1, len(test_x) + 1)

    # Split data into mini-batches for k-fold cross-validation
    num_of_batches = 5
    ids, batches_x, batches_y = io.split_data(X, y, num_of_batches)
    # val_ids, val_x, val_y = io.pick_set(X, y, 563)
    # train_ids, train_x, train_y = io.pick_set(X, y, 3800)

    # Train, comparing different model sizes
    training_errors = []
    validation_errors = []
    model_sizes = [3, 9, 27, 81, 243]
    for nn2 in model_sizes:
        nn1 = nn2 * 2
        avg_error_validation = 0
        avg_error_train = 0
        for batch_num in range(num_of_batches):
            nn.initialize(batches_x[0].shape[1], nn1=nn1, nn2=nn2)
            val_x = batches_x[batch_num]
            val_y = batches_y[batch_num]

            # Train on every batch except the held-out validation batch
            for train_batch_num in range(num_of_batches):
                if train_batch_num == batch_num:
                    continue
                nn.train(batches_x[train_batch_num],
                         batches_y[train_batch_num],
                         val_x, val_y,
                         training_steps=1000,
                         plot_prefix='k-fold-')

            # Calculate average training error with optimal w as a running
            # mean over the training batches (the skipped validation batch
            # must not count towards the divisor)
            train_error = 0
            batch_count = 0
            for train_batch_num in range(num_of_batches):
                if train_batch_num == batch_num:
                    continue
                c = nn.get_cost(batches_x[train_batch_num],
                                batches_y[train_batch_num])
                batch_count += 1
                train_error += (c - train_error) / batch_count
            avg_error_train += (train_error - avg_error_train) / (batch_num + 1)

            # Validate
            error_validation = nn.get_cost(val_x, val_y)
            avg_error_validation += \
                (error_validation - avg_error_validation) / (batch_num + 1)

            # Output
            print('Batch ' + str(batch_num) + ' validation error: ' +
                  str(error_validation))
            print('AVG validation error after validation batch ' +
                  str(batch_num) + ': ' + str(avg_error_validation))
            print(' ')
            print('-----')

        validation_errors.append(avg_error_validation)
        training_errors.append(avg_error_train)

    nn.save_nn(nn_model_filename)

    # Draw some results
    # viz.model_comp_results(results, model_comp_result_chart_filename)
    viz.model_size_comp(model_sizes, validation_errors, training_errors,
                        'nn1_model_size_comp.png')
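# The incremental updates above use the standard running-mean identity
# m_n = m_{n-1} + (x_n - m_{n-1}) / n, which maintains a mean without storing
# all the values. A minimal self-contained sketch of the same pattern:
def running_mean_sketch(values):
    mean = 0.0
    for n, x in enumerate(values, start=1):
        # Fold in the n-th value; mean is the average of the first n values
        mean += (x - mean) / n
    return mean

# Example: running_mean_sketch([1.0, 2.0, 3.0]) returns 2.0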
def run(input_path, output_path, cv_amount, use_even_distribution):
    # Read the data set
    data = read_data(input_path)

    # Find the number of images in the data set
    number_of_images = len(data['image_category'])

    # Find all image classes in the data set
    classes = sorted(set(data['image_category']))

    # Initialise the list of partitioned indices
    partitioned_indices = [[] for i in range(cv_amount)]

    # If even distribution is set to be used, partition data within each class
    # separately and merge the resulting partitions into the partitioned
    # indices list, so the image class distribution in each partition would be
    # roughly the same.
    if use_even_distribution:
        # Construct a list of image indices corresponding to each image class
        indices = {}
        for image_class in classes:
            indices[image_class] = []
        for i in range(number_of_images):
            indices[data['image_category'][i]].append(i)

        # Randomly split each of these lists into k nearly equal parts, and
        # merge them by partitions
        for image_class in classes:
            # Partition the indices list for the current image class into k
            # nearly equal parts
            partitions_list = partition_list(indices[image_class], cv_amount)

            # Shuffle the partition list to ensure that cumulative partitions
            # after merging by partitions are roughly of equal size
            shuffle(partitions_list)

            # Merge the partitioned indices list for the current image class
            # into the general partitioned indices list by partitions
            for i in range(cv_amount):
                partitioned_indices[i] += partitions_list[i]

    # If even distribution is not set to be used, partition data randomly.
    else:
        # Partition the indices list into k nearly equal parts
        partitioned_indices = partition_list(range(number_of_images),
                                             cv_amount)

    # Sort all of the partitions
    for partition in partitioned_indices:
        partition.sort()

    # Partition data
    partitions = []
    for i in range(cv_amount):
        partitions.append({
            'subjects': data['subjects'],
            'areas': data['areas'],
            'image_category': [data['image_category'][j]
                               for j in partitioned_indices[i]],
            'neural_responses': [data['neural_responses'][j]
                                 for j in partitioned_indices[i]]
        })

    # Save partitioned data
    for i in range(cv_amount):
        write_data(add_suffix_to_path(output_path, '-', i + 1), partitions[i])
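# partition_list is defined elsewhere in the repo. A minimal sketch of a
# helper that randomly splits a list into k nearly equal parts, consistent
# with how it is used above (the implementation details are an assumption):
def partition_list_sketch(items, k):
    from random import shuffle

    items = list(items)
    shuffle(items)
    # Round-robin slicing keeps the part sizes within one of each other
    return [items[i::k] for i in range(k)]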
def main(argv):
    input_filename_x = 'train_data.csv'
    input_filename_y = 'train_labels.csv'
    test_input_filename = 'test_data.csv'
    svc_model_filename = 'svc_classif.pkl'
    lr_model_filename = 'lr_classif.pkl'
    rfc_model_filename = 'rfc_classif.pkl'
    rfc_feat_imp_filename = 'rfc_feat_imp.png'
    model_comp_result_chart_filename = 'method_comp_res.png'
    nn_model_filename = 'nn1.pkl'

    io = lib.io.IO()
    viz = lib.viz.Viz()
    nn = lib.nn.NN(io, viz)
    cl = lib.cl.CL(io, viz)

    # Read data
    print("Reading train data...")
    X, y = io.read_data(input_filename_x, input_filename_y)
    y = io.shift_v(y, shift=-1)
    print("Reading test data...")
    test_x = io.read_data(test_input_filename, None)

    print("There are " + str(len(X)) + " samples in the train set.")
    print("There are " + str(len(test_x)) + " samples in the test set.")

    test_x = np.matrix(test_x)
    test_ids = range(1, len(test_x) + 1)

    # PCA etc.
    X = cl.pca(np.matrix(X), 'pca_explained_variance.png').tolist()
    test_x = cl.pca(test_x, None).tolist()

    # Split data to train and validation set
    # ids, batches_x, batches_y = io.split_data(X, y, 100, 100)
    val_ids, val_x, val_y = io.pick_set(X, y, 563)
    train_ids, train_x, train_y = io.pick_set(X, y, 3800)

    nn.initialize(train_x.shape[1], nn1=18, nn2=9, alpha=0.01)
    # , filename=nn_model_filename)

    # Train
    pred, proba, acc = nn.predict(train_x, train_y)
    print("Train set classification accuracy before training: %.4f" % acc)
    nn.train(train_x, train_y, val_x, val_y, training_steps=100000)
    nn.save_nn(nn_model_filename)

    # Validate
    pred, proba, acc = nn.predict(train_x, train_y)
    print("Train set classification accuracy after training: %.4f" % acc)
    pred, proba, acc = nn.predict(val_x, val_y)
    print("Validation set classification accuracy after training: %.4f" % acc)

    # Draw some results
    # viz.model_comp_results(results, model_comp_result_chart_filename)

    # Predict; shift the predicted classes back to the original 1-based labels
    pred_class, pred_proba, _ = nn.predict(test_x)
    pred_class = io.shift_v(pred_class, shift=1)

    # Output
    io.write_classes('nn_classes_sub_result.csv', test_ids, pred_class)
    io.write_probabilities('nn_probabilities_sub_result.csv', test_ids,
                           pred_proba)