def cross_validation_test():
    """Cross-validate a committee of ANNs on the survival data set.

    Trains a feed-forward committee with the evolutionary algorithm using a
    C-index-based error function, then prints per-network test/validation
    errors and their averages.  Hyper-parameters are fixed except the hidden
    layer size, which may be given as the first command line argument.

    Side effects: reads the hard-coded data file, silences glogger, prints
    results to stdout.
    """
    glogger.setLoggingLevel(glogger.nothing)
    filename = "/home/gibson/jonask/Dropbox/Ann-Survival-Phd/Two_thirds_of_SA_1889_dataset.txt"
    columns = (2, -4, -3, -2, -1)
    print('\nIncluding columns: ' + str(columns))
    # Columns 4 and 5 hold survival time and the binary event indicator.
    P, T = parse_file(filename, targetcols = [4, 5], inputcols = columns, ignorerows = [0], normalize = True)
    # Remove tail-censored patients before training.
    P, T = copy_without_tailcensored(P, T)
    comsize = 10
    print('Number of networks to cross-validate: ' + str(comsize))
    if len(sys.argv) < 2:
        netsize = 3
    else:
        # BUG FIX: sys.argv entries are strings — convert to int so the
        # network builder gets a numeric hidden-node count (matches
        # com_cross and the other cross_validation_test variant).
        netsize = int(sys.argv[1])
    print("Number of hidden nodes: " + str(netsize))
    pop_size = 50
    print("Population size: " + str(pop_size))
    mutation_rate = 0.25
    print("Mutation rate: " + str(mutation_rate))
    epochs = 200
    print("Epochs: " + str(epochs))
    com = build_feedforward_committee(comsize, len(P[0]), netsize, 1, output_function = 'linear')
    # 1 is the column in the target array which holds the binary censoring information
    test_errors, vald_errors = train_committee(com, train_evolutionary, P, T, 1, epochs, error_function = c_index_error, population_size = pop_size, mutation_chance = mutation_rate)
    print('\nTest Errors, Validation Errors:')
    for terr, verr in zip(test_errors.values(), vald_errors.values()):
        print(str(terr) + ", " + str(verr))
    print('\nTest average, Validation average:')
    print(str(sum(test_errors.values()) / len(test_errors.values())) + ', ' + str(sum(vald_errors.values()) / len(vald_errors.values())))
def experiment(net, filename, epochs): P, T = parse_file(filename, targetcols = [4], inputcols = [0, 1, 2, 3], ignorecols = [], ignorerows = [], normalize = False) #P = P[:100,:] #T = T[:100, :] try: #net = train_cox(net, (P, T), (None, None), timeslots, epochs = 500, learning_rate = 5) net = traingd(net, (P, T), (None, None), epochs = epochs, learning_rate = 0.01, block_size = 0) #net = train_evolutionary(net, (P, T), (None, None), epochs = epochs) except FloatingPointError: print('Aaawww....') outputs = net.sim(P) plot_network_weights(net) plt.figure() plt.title('Scatter plot sum square error\n' + filename) plt.xlabel('Survival time years') plt.ylabel('Network output') try: plt.scatter(T.flatten(), outputs.flatten(), c = 'g', marker = 's') plt.plot(T.flatten(), T.flatten(), 'r-') except: pass
# Script: evaluate a pre-trained committee on the Pima data set.
# NOTE(review): Python 2 — uses the print statement on the last line.
from os import path
from kalderstam.util.filehandling import parse_file, save_committee, load_committee
from kalderstam.neural.network import build_feedforward_committee, build_feedforward
from kalderstam.util.decorators import benchmark
from kalderstam.neural.training_functions import train_committee, traingd_block, train_evolutionary
import logging
from kalderstam.neural.matlab_functions import plotroc, stat
import matplotlib.pyplot as plt

logging.basicConfig(level=logging.DEBUG)

# load the training set
filename = path.join(path.expanduser("~"), "Kurser/ann_FYTN06/exercise1/pima_trn.dat")
inputs, targets = parse_file(filename, targetcols=8)

# load the test set
filename = path.join(path.expanduser("~"), "Kurser/ann_FYTN06/exercise1/pima_tst.dat")
test_inputs, tst_t = parse_file(filename)

test = (inputs, targets)
validation = ([], [])

# Load a previously trained committee (ROC area encoded in the file name).
com = load_committee("/export/home/jonask/Projects/aNeuralN/ANNs/pimatrain_rocarea84.0328358209.anncom")

# Estimate on test set now
# Y_test = com.sim(test_inputs)
# for value in Y_test:
#     print value[0]

# Evaluate the committee on a single test pattern (index 68).
Y_neg = com.update(test_inputs[68])
print Y_neg
def committee_test():
    """Interactively train and evaluate an ANN committee on survival data.

    Prompts for hyper-parameters, trains a committee with the evolutionary
    algorithm and a C-index error, prints C-indices, draws Kaplan-Meier
    plots, and optionally prints per-patient committee risk output.

    NOTE(review): Python 2 ``input()`` evaluates the typed expression; a bare
    ENTER raises SyntaxError, which is caught to select the default value.
    """
    try:
        netsize = input('Number of hidden nodes? [1]: ')
    except SyntaxError as e:
        netsize = 1
    try:
        comsize = input('Committee size? [1]: ')
    except SyntaxError as e:
        comsize = 1
    try:
        pop_size = input('Population size? [100]: ')
    except SyntaxError as e:
        pop_size = 100
    try:
        mutation_rate = input('Please input a mutation rate (0.05): ')
    except SyntaxError as e:
        mutation_rate = 0.05
    filename = "/home/gibson/jonask/Dropbox/Ann-Survival-Phd/Two_thirds_of_SA_1889_dataset.txt"
    try:
        columns = input("Which columns to include? (Do NOT forget trailing comma if only one column is used, e.g. '3,'\nAvailable columns are: 2, -4, -3, -2, -1. Just press ENTER for all columns.\n")
    except SyntaxError:
        columns = (2, -4, -3, -2, -1)
    # Columns 4 and 5 hold survival time and the binary event indicator.
    P, T = parse_file(filename, targetcols = [4, 5], inputcols = columns, ignorerows = [0], normalize = True)
    #remove tail censored
    try:
        cutoff = input('Cutoff for censored data? [9999 years]: ')
    except SyntaxError as e:
        cutoff = 9999
    P, T = copy_without_censored(P, T, cutoff)
    #Divide into validation sets
    try:
        test_size = float(input('Size of test set (not used in training)? Input in fractions. Default is [0.0]: '))
    except:
        test_size = 0.0
    ((TP, TT), (VP, VT)) = get_validation_set(P, T, validation_size = test_size, binary_column = 1)
    print("Length of training set: " + str(len(TP)))
    print("Length of test set: " + str(len(VP)))
    try:
        epochs = input("\nNumber of generations (1): ")
    except SyntaxError as e:
        epochs = 1
    com = build_feedforward_committee(comsize, len(P[0]), netsize, 1, output_function = 'linear')
    #1 is the column in the target array which holds the binary censoring information
    test_errors, vald_errors, data_sets = train_committee(com, train_evolutionary, P, T, 1, epochs, error_function = c_index_error, population_size = pop_size, mutation_chance = mutation_rate)
    com.set_training_sets([set[0][0] for set in data_sets]) #first 0 gives training sets, second 0 gives inputs.
    print('\nTest C_indices, Validation C_indices:')
    # Errors are inverse C-indices, so 1/err recovers the C-index.
    for terr, verr in zip(test_errors.values(), vald_errors.values()):
        print(str(1 / terr) + ", " + str(1 / verr))
    if plt:  # plt may be None if matplotlib could not be imported
        outputs = numpy.array([[com.risk_eval(inputs)] for inputs in TP]) #Need double brackets for dimensions to be right for numpy
        kaplanmeier(time_array = TT[:, 0], event_array = TT[:, 1], output_array = outputs[:, 0], threshold = 0.5)
        train_c_index = get_C_index(TT, outputs)
        print("\nC-index on the training set: " + str(train_c_index))
        if len(VP) > 0:
            outputs = numpy.array([[com.risk_eval(inputs)] for inputs in VP]) #Need double brackets for dimensions to be right for numpy
            test_c_index = get_C_index(VT, outputs)
            kaplanmeier(time_array = VT[:, 0], event_array = VT[:, 1], output_array = outputs[:, 0], threshold = 0.5)
            print("C-index on the test set: " + str(test_c_index))
        #raw_input("\nPress enter to show plots...")
        plt.show()
    try:
        answer = input("\nDo you wish to print committee risk output? ['n']: ")
    except (SyntaxError, NameError):
        answer = 'n'
    if answer != 'n' and answer != 'no':
        inputs = read_data_file(filename)
        P, T = parse_file(filename, targetcols = [4, 5], inputcols = columns, ignorerows = [0], normalize = True)
        outputs = [[com.risk_eval(patient)] for patient in P]
        # Pad the output column with header rows so it lines up with the
        # raw file rows (which include e.g. the header line).
        while len(inputs) > len(outputs):
            outputs.insert(0, ["net_output"])
        print("\n")
        # Print each raw input row followed by the committee output, CSV-style.
        for rawline in zip(inputs, outputs):
            line = ''
            for col in rawline[0]:
                line += str(col)
                line += ','
            for col in rawline[1]:
                line += str(col)
            print(line)
def com_cross():
    """Repeated cross-validation of ANN committees on the survival data set.

    For each of ``times_to_cross`` repetitions, splits the data into
    ``comnum`` cross-validation folds, trains one committee per fold with the
    evolutionary algorithm, and prints training/validation C-index errors.
    The hidden layer size may be given as the first command line argument.
    """
    filename = "/home/gibson/jonask/Dropbox/Ann-Survival-Phd/Two_thirds_of_SA_1889_dataset.txt"
    #try:
    #    columns = input("Which columns to include? (Do NOT forget trailing comma if only one column is used, e.g. '3,'\nAvailable columns are: 2, -4, -3, -2, -1. Just press ENTER for all columns.\n")
    #except SyntaxError:
    #if len(sys.argv) < 3:
    columns = (2, -4, -3, -2, -1)
    #else:
    #    columns = [int(col) for col in sys.argv[2:]]
    print('\nIncluding columns: ' + str(columns))
    # Columns 4 and 5 hold survival time and the binary event indicator.
    P, T = parse_file(filename, targetcols = [4, 5], inputcols = columns, ignorerows = [0], normalize = True)
    #remove tail censored
    #print('\nRemoving tail censored...')
    #P, T = copy_without_censored(P, T)
    #Divide into validation sets
    #test_size = 0.33
    #print('Size of test set (not used in training): ' + str(test_size))
    #((TP, TT), (VP, VT)) = get_validation_set(P, T, validation_size = test_size, binary_column = 1)
    print("\nData set:")
    print("Number of patients with events: " + str(T[:, 1].sum()))
    print("Number of censored patients: " + str((1 - T[:, 1]).sum()))
    #print("Length of training set: " + str(len(TP)))
    #print("Length of test set: " + str(len(VP)))
    if len(sys.argv) < 2:
        netsize = 1
    else:
        netsize = int(sys.argv[1])
    print("\nNumber of hidden nodes: " + str(netsize))
    comsize = 4
    print('Number of members in each committee: ' + str(comsize))
    comnum = 5
    print('Number of committees to cross-validate: ' + str(comnum))
    times_to_cross = 3
    print('Number of times to repeat cross-validation: ' + str(times_to_cross))
    pop_size = 100
    print("Population size: " + str(pop_size))
    mutation_rate = 0.05
    print("Mutation rate: " + str(mutation_rate))
    epochs = 100
    print("Epochs: " + str(epochs))
    for _cross_time in xrange(times_to_cross):
        # Fresh folds every repetition.
        data_sets = get_cross_validation_sets(P, T, comnum , binary_column = 1)
        print('\nTest Errors, Validation Errors:')
        for _com_num, (TS, VS) in zip(xrange(comnum), data_sets):
            com = build_feedforward_committee(comsize, len(P[0]), netsize, 1, output_function = 'linear')
            #1 is the column in the target array which holds the binary censoring information
            test_errors, vald_errors, internal_sets = train_committee(com, train_evolutionary, TS[0], TS[1], 1, epochs, error_function = c_index_error, population_size = pop_size, mutation_chance = mutation_rate)
            com.set_training_sets([set[0][0] for set in internal_sets]) #first 0 gives training sets, second 0 gives inputs.
            outputs = numpy.array([[com.risk_eval(inputs)] for inputs in TS[0]]) #Need double brackets for dimensions to be right for numpy
            train_c_index = get_C_index(TS[1], outputs)
            outputs = numpy.array([[com.risk_eval(inputs)] for inputs in VS[0]]) #Need double brackets for dimensions to be right for numpy
            val_c_index = get_C_index(VS[1], outputs)
            # Errors are inverse C-indices, so 1/c_index is the error value.
            print(str(1.0 / train_c_index) + ", " + str(1.0 / val_c_index))
# NOTE(review): this chunk begins MID-FUNCTION — the matching "try:" and the
# enclosing def are outside this view; it also ends mid-"__main__" block.
# Reconstruction of the visible statements only; confirm nesting against the
# full file.
    except FloatingPointError:
        # Numerical blow-up during training: keep the partially trained net.
        print('Aaawww....')
    outputs = net.sim(P)
    c_index = get_C_index(T, outputs)
    logger.info("C index = " + str(c_index))
    plot_network_weights(net)
    return net

if __name__ == "__main__":
    logging.basicConfig(level = logging.INFO)
    glogger.setLoggingLevel(glogger.debug)
    filename = "/home/gibson/jonask/Dropbox/Ann-Survival-Phd/Two_thirds_of_SA_1889_dataset.txt"
    # Columns 4 and 5 hold survival time and the binary event indicator.
    P, T = parse_file(filename, targetcols = [4, 5], inputcols = [2, -4, -3, -2, -1], ignorerows = [0], normalize = True)
    #P, T = parse_file(filename, targetcols = [4, 5], inputcols = [2, -3], ignorerows = [0], normalize = True)
    #Remove tail censored
    P, T = copy_without_tailcensored(P, T)
    #Limit to incourage overtraining!
    #rows = sample(range(len(T)), 100)
    #P = P[rows]
    #T = T[rows]
    p = len(P[0]) #number of input covariates
    #net = load_network('/home/gibson/jonask/Projects/aNeuralN/ANNs/4x10x10x1.ann')
    net = build_feedforward(p, 30, 1, output_function = 'linear')
# NOTE(review): this chunk begins with the tail of a function defined outside
# this view and ends mid-loop in the "__main__" block — the loop body
# continues beyond this chunk.  Confirm against the full file.
    return net

if __name__ == "__main__":
    logging.basicConfig(level = logging.INFO)
    glogger.setLoggingLevel(glogger.nothing)
    filename = "/home/gibson/jonask/Dropbox/Ann-Survival-Phd/Two_thirds_of_SA_1889_dataset.txt"
    #try:
    #    columns = input("Which columns to include? (Do NOT forget trailing comma if only one column is used, e.g. '3,'\nAvailable columns are: 2, -4, -3, -2, -1. Just press ENTER for all columns.\n")
    #except SyntaxError:
    columns = (2, -4, -3, -2, -1)
    print('\nIncluding columns: ' + str(columns))
    # Columns 4 and 5 hold survival time and the binary event indicator.
    P, T = parse_file(filename, targetcols = [4, 5], inputcols = columns, ignorerows = [0], normalize = True)
    #remove tail censored
    #print('\nRemoving tail censored...')
    #P, T = copy_without_tailcensored(P, T)
    # Python 2 input(): bare ENTER raises SyntaxError -> default is used.
    try:
        pieces = input('Number of crossvalidation pieces? [1]: ')
    except SyntaxError as e:
        pieces = 1
    #Divide into validation sets
    TandV = get_cross_validation_sets(P, T, pieces , binary_column = 1)
    for set, ((tP, tT), (vP, vT)) in zip(range(pieces), TandV):
        print("\nCross validation set " + str(set))
        print("Training")
# NOTE(review): the statements before the "__main__" guard are the interior
# of a scatter-matrix plotting loop whose enclosing def/loop headers are
# outside this view — the nesting depth shown here is a guess; confirm
# against the full file.
        # Scatter covariate currentCol against covariate currentRow+1.
        scatter(P[:, currentCol], P[:, currentRow + 1], ax = ax, plotSlope = False)
        if currentCol == currentRow:
            ax.set_title(headers[currentCol])
        else:
            ax.set_title('')
        #ax.set_xlabel(headers[currentCol])
        if currentCol == 0:
            ax.set_ylabel(headers[currentRow+1])
        #Finish with this
        currentCol += 1

if __name__ == '__main__':
    from kalderstam.util.filehandling import parse_file
    filename = "/home/gibson/jonask/Dropbox/Ann-Survival-Phd/publication_data/Two_thirds_of_the_n4369_dataset_with_logs_lymf.txt"
    columns = ('age', 'log(1+lymfmet)', 'n_pos', 'tumsize', 'log(1+er_cyt)', 'log(1+pgr_cyt)', 'pgr_cyt_pos', 'er_cyt_pos', 'size_gt_20', 'er_cyt', 'pgr_cyt', 'time')
    #filename = "/home/gibson/jonask/Projects/DataMaker/hard_survival_test.txt"
    #columns = ('X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9')
    #targets = ['censtime', 'event']
    #columns = ('time', 'censtime', 'noisytime', 'censnoisytime')
    # No target columns: everything listed in `columns` is an input.
    targets = []
    P, T = parse_file(filename, targetcols = targets, inputcols = columns, normalize = True, separator = '\t', use_header = True)
    plt.figure()
    scatter_all_inputs(P, columns)
    plt.show()
def train_single():
    """Interactively train a single ANN on a chosen survival study.

    Prompts for hyper-parameters and study, trains via the module's ``test``
    helper on two cross-validation folds, then shows the glogger plots.

    NOTE(review): Python 2 ``input()`` evaluates the typed expression; a bare
    ENTER raises SyntaxError, which is caught to select the default value.
    """
    try:
        netsize = input('Number of hidden nodes? [3]: ')
    except SyntaxError as e:
        netsize = 3
    try:
        pop_size = input('Population size? [50]: ')
    except SyntaxError as e:
        pop_size = 50
    try:
        mutation_rate = input('Please input a mutation rate (0.25): ')
    except SyntaxError as e:
        mutation_rate = 0.25
    SB22 = "/home/gibson/jonask/Dropbox/Ann-Survival-Phd/Two_thirds_of_SA_1889_dataset_SB22.txt"
    Benmargskohorten = "/home/gibson/jonask/Dropbox/Ann-Survival-Phd/Two_thirds_of_SA_1889_dataset_Benmargskohorten.txt"
    SB91b = "/home/gibson/jonask/Dropbox/Ann-Survival-Phd/Two_thirds_of_SA_1889_dataset_SB91b.txt"
    all_studies = "/home/gibson/jonask/Dropbox/Ann-Survival-Phd/Two_thirds_of_SA_1889_dataset.txt" #Real data
    print("Studies to choose from:")
    print("1: SB22")
    print("2: Benmargskohorten")
    print("3: SB91b")
    print("0: All combined (default)")
    try:
        study = input("Which study to train on? [0]: ")
    except SyntaxError as e:
        study = 0
    if study == 1:
        filename = SB22
    elif study == 2:
        filename = Benmargskohorten
    elif study == 3:
        filename = SB91b
    else:
        filename = all_studies
    try:
        columns = input("Which columns to include? (Do NOT forget trailing comma if only one column is used, e.g. '3,'\nAvailable columns are: 2, -4, -3, -2, -1. Just press ENTER for all columns.\n")
    except SyntaxError:
        columns = (2, -4, -3, -2, -1)
    #P, T = parse_file(filename, targetcols = [4, 5], inputcols = [2, -4, -3, -2, -1], ignorerows = [0], normalize = True)
    # Columns 4 and 5 hold survival time and the binary event indicator.
    P, T = parse_file(filename, targetcols = [4, 5], inputcols = columns, ignorerows = [0], normalize = True)
    #Used for output comparison
    studies = {}
    studies[SB22] = parse_file(SB22, targetcols = [4, 5], inputcols = columns, ignorerows = [0], normalize = True)
    studies[Benmargskohorten] = parse_file(Benmargskohorten, targetcols = [4, 5], inputcols = columns, ignorerows = [0], normalize = True)
    studies[SB91b] = parse_file(SB91b, targetcols = [4, 5], inputcols = columns, ignorerows = [0], normalize = True)
    studies[all_studies] = parse_file(all_studies, targetcols = [4, 5], inputcols = columns, ignorerows = [0], normalize = True)
    #remove tail censored
    #P, T = copy_without_tailcensored(P, T)
    #Divide into validation sets
    #((tP, tT), (vP, vT)) = get_validation_set(P, T, validation_size = 0.25, binary_column = 1)
    TandV = get_cross_validation_sets(P, T, 2 , binary_column = 1)
    #Network part
    p = len(P[0]) #number of input covariates
    net = build_feedforward(p, netsize, 1, output_function = 'linear')
    #net = build_feedforward_multilayered(p, [7, 10], 1, output_function = 'linear')
    try:
        epochs = input("Number of generations (200): ")
    except SyntaxError as e:
        epochs = 200
    for times, ((tP, tT), (vP, vT)) in zip(xrange(2), TandV):
        #train
        net = test(net, tP, tT, vP, vT, filename, epochs, population_size = pop_size, mutation_rate = mutation_rate)
    raw_input("Press enter to show plots...")
    glogger.show()
def cross_validation_test():
    """Repeatedly cross-validate an ANN committee on the survival data set.

    Trains a committee with the evolutionary algorithm and a C-index error,
    ``times_to_cross`` times, printing per-network test/validation errors.
    Hidden-node count may come from ``sys.argv[1]`` and input columns from
    ``sys.argv[2:]``.
    """
    filename = "/home/gibson/jonask/Dropbox/Ann-Survival-Phd/Two_thirds_of_SA_1889_dataset.txt"
    #try:
    #    columns = input("Which columns to include? (Do NOT forget trailing comma if only one column is used, e.g. '3,'\nAvailable columns are: 2, -4, -3, -2, -1. Just press ENTER for all columns.\n")
    #except SyntaxError:
    if len(sys.argv) < 3:
        columns = (2, -4, -3, -2, -1)
    else:
        columns = [int(col) for col in sys.argv[2:]]
    print('\nIncluding columns: ' + str(columns))
    # Columns 4 and 5 hold survival time and the binary event indicator.
    P, T = parse_file(filename, targetcols = [4, 5], inputcols = columns, ignorerows = [0], normalize = True)
    #remove tail censored
    #print('\nRemoving tail censored...')
    #P, T = copy_without_censored(P, T)
    print("\nData set:")
    print("Number of patients with events: " + str(T[:, 1].sum()))
    print("Number of censored patients: " + str((1 - T[:, 1]).sum()))
    comsize = 5
    print('\nNumber of networks to cross-validate: ' + str(comsize))
    times_to_cross = 3
    print('\nNumber of times to repeat cross-validation: ' + str(times_to_cross))
    if len(sys.argv) < 2:
        netsize = 1
    else:
        netsize = int(sys.argv[1])
    print("Number of hidden nodes: " + str(netsize))
    pop_size = 100
    print("Population size: " + str(pop_size))
    mutation_rate = 0.05
    print("Mutation rate: " + str(mutation_rate))
    epochs = 400
    print("Epochs: " + str(epochs))
    for _ in xrange(times_to_cross):
        com = build_feedforward_committee(comsize, len(P[0]), netsize, 1, output_function = 'linear')
        #1 is the column in the target array which holds the binary censoring information
        test_errors, vald_errors, data_sets = train_committee(com, train_evolutionary, P, T, 1, epochs, error_function = c_index_error, population_size = pop_size, mutation_chance = mutation_rate)
        print('\nTest Errors, Validation Errors:')
        for terr, verr in zip(test_errors.values(), vald_errors.values()):
            print(str(terr) + ", " + str(verr))
def train_single():
    """Interactively train a single ANN with censoring cutoff and KM plots.

    Prompts for hyper-parameters and study, removes censored patients beyond
    a cutoff, trains via the module's ``test`` helper on cross-validation
    folds, draws Kaplan-Meier plots, and optionally writes network output to
    a file.

    NOTE(review): Python 2 ``input()`` evaluates the typed expression; a bare
    ENTER raises SyntaxError, which is caught to select the default value.
    """
    try:
        netsize = input('Number of hidden nodes? [1]: ')
    except SyntaxError as e:
        netsize = 1
    try:
        pop_size = input('Population size? [100]: ')
    except SyntaxError as e:
        pop_size = 100
    try:
        mutation_rate = input('Please input a mutation rate (0.05): ')
    except SyntaxError as e:
        mutation_rate = 0.05
    SB22 = "/home/gibson/jonask/Dropbox/Ann-Survival-Phd/Two_thirds_of_SA_1889_dataset_SB22.txt"
    Benmargskohorten = "/home/gibson/jonask/Dropbox/Ann-Survival-Phd/Two_thirds_of_SA_1889_dataset_Benmargskohorten.txt"
    SB91b = "/home/gibson/jonask/Dropbox/Ann-Survival-Phd/Two_thirds_of_SA_1889_dataset_SB91b.txt"
    all_studies = "/home/gibson/jonask/Dropbox/Ann-Survival-Phd/Two_thirds_of_SA_1889_dataset.txt" #Real data
    print("Studies to choose from:")
    print("1: SB22")
    print("2: Benmargskohorten")
    print("3: SB91b")
    print("0: All combined (default)")
    try:
        study = input("Which study to train on? [0]: ")
    except SyntaxError as e:
        study = 0
    if study == 1:
        filename = SB22
    elif study == 2:
        filename = Benmargskohorten
    elif study == 3:
        filename = SB91b
    else:
        filename = all_studies
    try:
        columns = input("Which columns to include? (Do NOT forget trailing comma if only one column is used, e.g. '3,'\nAvailable columns are: 2, -4, -3, -2, -1. Just press ENTER for all columns.\n")
    except SyntaxError:
        columns = (2, -4, -3, -2, -1)
    #P, T = parse_file(filename, targetcols = [4, 5], inputcols = [2, -4, -3, -2, -1], ignorerows = [0], normalize = True)
    # Columns 4 and 5 hold survival time and the binary event indicator.
    P, T = parse_file(filename, targetcols = [4, 5], inputcols = columns, ignorerows = [0], normalize = True)
    #Used for output comparison
    studies = {}
    studies[SB22] = parse_file(SB22, targetcols = [4, 5], inputcols = columns, ignorerows = [0], normalize = True)
    studies[Benmargskohorten] = parse_file(Benmargskohorten, targetcols = [4, 5], inputcols = columns, ignorerows = [0], normalize = True)
    studies[SB91b] = parse_file(SB91b, targetcols = [4, 5], inputcols = columns, ignorerows = [0], normalize = True)
    studies[all_studies] = parse_file(all_studies, targetcols = [4, 5], inputcols = columns, ignorerows = [0], normalize = True)
    #remove tail censored
    try:
        cutoff = input('Cutoff for censored data? [9999 years]: ')
    except SyntaxError as e:
        cutoff = 9999
    P, T = copy_without_censored(P, T, cutoff)
    #Divide into validation sets
    try:
        pieces = input('Size of validation set? Input denominator (1 for no validation set). Default is 1/[1] parts: ')
    except:
        pieces = 1
    TandV = get_cross_validation_sets(P, T, pieces , binary_column = 1)
    #Network part
    p = len(P[0]) #number of input covariates
    net = build_feedforward(p, netsize, 1, output_function = 'linear')
    #net = build_feedforward_multilayered(p, [7, 10], 1, output_function = 'linear')
    #Initial state
    #outputs = net.sim(tP)
    #orderscatter(outputs, tT, filename, 's')
    try:
        epochs = input("Number of generations (1): ")
    except SyntaxError as e:
        epochs = 1
    for ((tP, tT), (vP, vT)) in TandV:
        #train
        net = test(net, tP, tT, vP, vT, filename, epochs, population_size = pop_size, mutation_rate = mutation_rate)
        if plt:  # plt may be None if matplotlib could not be imported
            outputs = net.sim(tP)
            # Threshold splits the training set into two equal-risk groups.
            threshold = kaplanmeier(time_array = tT[:, 0], event_array = tT[:, 1], output_array = outputs[:, 0])
            if len(vP) > 0:
                outputs = net.sim(vP)
                kaplanmeier(time_array = vT[:, 0], event_array = vT[:, 1], output_array = outputs[:, 0], threshold = threshold)
            print("\nThreshold dividing the training set in two equal pieces: " + str(threshold))
            raw_input("\nPress enter to show plots...")
            plt.show()
    try:
        answer = input("Do you wish to print network output? Enter filename, or 'no' / 'n'. ['n']: ")
    except (SyntaxError, NameError):
        answer = 'n'
    # Avoid clobbering an existing file by prefixing a random number.
    if os.path.exists(answer):
        print("File exists. Will add random number to front")
        answer = str(random.randint(0, 123456)) + answer
    if answer != 'n' and answer != 'no':
        print_output(answer, net, filename, targetcols = [4, 5], inputcols = columns, ignorerows = [0], normalize = True)