def test(model, data, labels, test_size, data_type, method, color):
    """Fit and score ``model`` under five real-vs-shuffled scenarios.

    Every scenario draws a fresh ``train_test_split`` (with ``test_size``),
    refits ``model`` and prints the value of ``model.score`` to standard
    output.  The scenarios are, in order:

    1. original data and labels;
    2. ``util.shuffle_data`` applied to a deep copy of ``data``;
    3. ``util.shuffle_labels`` applied to a deep copy of ``labels``;
    4. real training data, ``util.shuffle_data`` applied to the test split;
    5. ``util.shuffle_data`` applied to the training split, real test data.

    Nothing is returned.  ``data_type``, ``method`` and ``color`` are
    accepted but not used by this function.
    """

    def split(features, targets):
        # Same split call for every scenario; only the inputs differ.
        return train_test_split(features, targets, test_size=test_size)

    def report(template, score):
        print(template % score)

    # 1) Baseline: untouched data and labels.
    x_train, x_test, y_train, y_test = split(data, labels)
    model.fit(x_train, y_train)
    report("accuracy with original data: %0.2f ",
           model.score(x_test, y_test))

    # 2) Shuffle a deep copy of the data before splitting, so the caller's
    #    ``data`` object itself is not handed to the shuffling helper.
    data_copy = copy.deepcopy(data)
    shuffled_features = util.shuffle_data(data_copy)
    x_train, x_test, y_train, y_test = split(shuffled_features, labels)
    model.fit(x_train, y_train)
    report("accuracy with shuffled data: %0.2f ",
           model.score(x_test, y_test))

    # 3) Shuffle a deep copy of the labels before splitting.
    labels_copy = copy.deepcopy(labels)
    shuffled_targets = util.shuffle_labels(labels_copy)
    x_train, x_test, y_train, y_test = split(data, shuffled_targets)
    model.fit(x_train, y_train)
    report("accuracy with shuffled labels: %0.2f ",
           model.score(x_test, y_test))

    # 4) Train on the real split, score on a shuffled test split.
    x_train, x_test, y_train, y_test = split(data, labels)
    model.fit(x_train, y_train)
    shuffled_test = util.shuffle_data(x_test)
    report("accuracy - real training data + shuffled test data: %0.2f ",
           model.score(shuffled_test, y_test))

    # 5) Train on a shuffled training split, score on the real test split.
    x_train, x_test, y_train, y_test = split(data, labels)
    shuffled_train = util.shuffle_data(x_train)
    model.fit(shuffled_train, y_train)
    report("accuracy - shuffled training data + real test data: %0.2f ",
           model.score(x_test, y_test))
# Script driver.
#
# Reads job entries via ``util.readFile(input_file)`` (``input_file`` is bound
# outside this block).  Each truthy entry is indexed as
# ``(file1, file2, analysis_type, protein)``; a falsy (empty) entry closes the
# current group and triggers the per-group accuracy plot.
output_dir = sys.argv[2]
util.remove_old_output_files(output_dir)

# From here on everything printed goes to the generated output file.
# NOTE(review): the handle is never closed or restored in this block;
# presumably it is left to interpreter shutdown -- confirm before changing.
sys.stdout = open(util.generate_output_filename(output_dir), "w")

lines = util.readFile(input_file)

# Not read in this block.
# NOTE(review): presumably consumed as module globals elsewhere -- confirm.
threegrams = create_keys(3)
twograms = create_keys(2)

proteins = []
accuracies = []
for line in lines:
    if not line:
        # End of a group.  Only plot when the group actually collected
        # results: otherwise ``analysis_type`` may still be unbound (blank
        # entry before any job) or the plot would be drawn from empty lists
        # (consecutive blank entries).
        if accuracies:
            util.plot_accuracies(accuracies, proteins, analysis_type, output_dir)
        proteins = []
        accuracies = []
        print("\n\n")
        continue

    print("-----------------------------------------")
    file1, file2, analysis_type, protein = line[0], line[1], line[2], line[3]
    print(file1, file2, analysis_type, protein)
    print("start processing ", analysis_type, protein)

    plot_name = analysis_type + '-' + protein
    util.start_roc_plot(plot_name)
    data, labels = get_data_from_files(file1, file2)

    accuracy = run(data, labels, plot_name)
    accuracies.append(accuracy)
    proteins.append(protein)

    print("&&& &&& &&& &&& &&& &&& &&& &&& &&&")
    # Control run on shuffled data; its return value is deliberately not
    # recorded in ``accuracies``.
    run(util.shuffle_data(data), labels, plot_name + '_shuffle')

    util.save_plot(output_dir, plot_name, 'svg', 600)
    print("done processing ", analysis_type, protein)

# Flush the final group when the input does not end with an empty entry;
# without this its accuracies would never be plotted.
if accuracies:
    util.plot_accuracies(accuracies, proteins, analysis_type, output_dir)