def test(net, P, T, vP, vT, filename, epochs, mutation_rate = 0.05, population_size = 50):
    '''
    Train net with the genetic algorithm and log its C-index before and after.

    P and T are the training inputs and targets, vP and vT the validation
    inputs and targets (vP may be None or empty). Column 1 of the target
    arrays is summed as the event indicator. filename is only used to label
    the log output. epochs, mutation_rate and population_size are passed on
    to train_evolutionary.

    Returns the network given back by train_evolutionary, or the network
    that was passed in if training was aborted by a FloatingPointError.
    '''
    has_validation = vP is not None and len(vP) > 0

    logger.info("Running genetic test for: " + filename + ' ' + str(epochs))
    print("\nTraining set:")
    print("Number of patients with events: " + str(T[:, 1].sum()))
    print("Number of censored patients: " + str((1 - T[:, 1]).sum()))

    print("\nValidation set:")
    if has_validation:
        print("Number of patients with events: " + str(vT[:, 1].sum()))
        print("Number of censored patients: " + str((1 - vT[:, 1]).sum()))
    else:
        print("Empty")

    #Performance of the untrained network, for reference
    outputs = net.sim(P)
    c_index = get_C_index(T, outputs)
    logger.info("C index test = " + str(c_index))

    try:
        net = train_evolutionary(net, (P, T), (vP, vT), epochs, error_function = c_index_error,
                                 population_size = population_size, mutation_chance = mutation_rate)
    except FloatingPointError:
        #Best effort: keep going with whatever network we have
        print('Aaawww....')

    #Evaluate once, whether training succeeded or not
    outputs = net.sim(P)
    c_index = get_C_index(T, outputs)
    logger.info("C index test = " + str(c_index))

    if has_validation:
        outputs = net.sim(vP)
        c_index = get_C_index(vT, outputs)
        logger.info("C index vald = " + str(c_index))

    return net
def testGeneticCindexError(self):
    '''
    Compare c_index_error (divided by the number of targets) with
    1 / C-index for random, perfectly ordered, reversed and "one swap off"
    outputs.
    '''
    print("\nC Error")
    T = self.generateRandomTestData(1000)
    outputs = self.generateRandomTestData(1000)

    #Random outputs
    c_index = get_C_index(T, outputs)
    rand_error = c_index_error(T, outputs) / len(T)
    test_error = 1 / c_index
    print("rand_error = ", rand_error, "test value = ", test_error, "c_index = ", c_index)
    #abs() so that a too small error is caught as well as a too large one
    assert abs(rand_error - test_error) < 0.0001

    #Perfectly ordered outputs, and the same outputs reversed
    T[:, 0] = np.arange(len(T))
    outputs = T
    rev_outputs = outputs[::-1]

    c_index = get_C_index(T, outputs)
    ord_error = c_index_error(T, outputs) / len(T)
    test_error = 1 / c_index
    print("ordered_error = ", ord_error, "test value = ", test_error, "c_index = ", c_index)
    assert ord_error == test_error

    c_index = get_C_index(T, rev_outputs)
    rev_error = c_index_error(T, rev_outputs) / len(T)
    #1 / c_index would give zero-division here, the expected value is 9000
    test_error = 9000.0
    print("reversed_error = ", rev_error, "test value = ", test_error, "c_index = ", c_index)
    assert rev_error == test_error
    assert ord_error < rev_error

    #Ordered outputs with the first and last patient swapped
    T[:, 0] = np.arange(len(T))
    T[0, 1], T[-1, 1] = 1, 1 #Make sure they are non-censored
    outputs = T.copy()
    #Rows of an array are views, so a tuple swap would copy the last row
    #over the first and then copy it back onto itself. Swap by index instead.
    outputs[[0, -1]] = outputs[[-1, 0]]
    rev_outputs = outputs[::-1]

    c_index = get_C_index(T, outputs)
    ord_error = c_index_error(T, outputs) / len(T)
    test_error = 1 / c_index
    print("1_off_error = ", ord_error, "test value = ", test_error, "c_index = ", c_index)
    assert ord_error == test_error
    assert ord_error > 1

    c_index = get_C_index(T, rev_outputs)
    rev_error = c_index_error(T, rev_outputs) / len(T)
    test_error = 1 / c_index
    print("1_off_reversed_error = ", rev_error, "test value = ", test_error, "c_index = ", c_index)
    assert rev_error == test_error
    assert rev_error > 1
def c_index_error(target, result):
    '''
    Error function used in genetic training, based on the C-index.

    The C-index of result against target is passed to __inversed__ together
    with len(target). The length is sent along because the genetic algorithm
    divides the error by the length of the target array.
    '''
    return __inversed__(get_C_index(target, result), len(target))
def test_model_arrays(savefile, filename, P, T, **kwargs):
    '''
    Load a pickled committee from savefile and evaluate it on the inputs P.

    The committee output for every row of P is written to
    'test_<savefile>_<filename>.cvs' (base names without extension).
    filename is only used to build the names of the output files.

    If T is None or empty only the outputs are written, and the output
    ARRAY is returned. Otherwise the C-index is printed, a Kaplan-Meier plot
    is saved as 'kaplanmeier_<savefile>_<filename>.eps', targets, outputs
    and events are written and the NAME of the written file is returned.
    Extra keyword arguments are passed on to kaplanmeier.
    '''
    #NOTE: unpickling can run arbitrary code, only load committee files
    #from a trusted source. Pickles are read in binary mode.
    with open(savefile, 'rb') as FILE:
        master_com = pickle.load(FILE)

    print("Committee size: {0}".format(len(master_com)))

    model_name = os.path.splitext(os.path.basename(savefile))[0]
    data_name = os.path.splitext(os.path.basename(filename))[0]
    #NOTE(review): the '.cvs' extension looks like a typo for '.csv'. Kept,
    #since other tools may look for this exact name.
    output_file = 'test_{0}_{1}.cvs'.format(model_name, data_name)

    #Need double brackets for dimensions to be right for numpy
    outputs = numpy.array([[master_com.risk_eval(inputs)] for inputs in P])

    if T is None or len(T) == 0:
        #No targets available, only the outputs can be reported
        with open(output_file, 'w') as F:
            F.write("Outputs\n")
            for o in outputs:
                F.write("{0}\n".format(o[0]))
        return outputs

    c_index = get_C_index(T, outputs)
    print("C-Index: {0}".format(c_index))

    thresholds = None

    #Calculate suitable size for the figure for use in LaTEX
    fig_width_pt = 396.0  # Get this from LaTeX using \showthe\columnwidth
    inches_per_pt = 1.0 / 72.27               # Convert pt to inch
    golden_mean = (sqrt(5) - 1.0) / 2.0      # Aesthetic ratio
    fig_width = fig_width_pt * inches_per_pt  # width in inches
    fig_height = fig_width * golden_mean      # height in inches
    fig_size = [fig_width, fig_height]
    #Update settings
    plt.rcParams['figure.figsize'] = fig_size

    kaplanmeier(time_array=T[:, 0], event_array=T[:, 1], output_array=outputs, threshold=thresholds,
                show_plot=False, bestcut=False, **kwargs)
    if plt:
        plt.savefig('kaplanmeier_{0}_{1}.eps'.format(model_name, data_name))

    with open(output_file, 'w') as F:
        F.write("Targets,Outputs,Events\n")
        for t, o in zip(T, outputs):
            F.write("{0},{1},{2}\n".format(t[0], o[0], t[1]))

    return output_file
def main(model, test_data, test_targets, column_map):
    '''
    Measure how much the C-index of model changes when one input variable
    at a time is overwritten with a constant.

    column_map maps variable names to column indices of test_data. For each
    variable, that column is set to 1 in a copy of test_data and the C-index
    is recalculated.

    Returns a dictionary mapping each variable name to
    100 * (baseline C-index - C-index with the variable overwritten).
    '''
    print(column_map)

    def c_index_for(data):
        #Need double brackets for the dimensions to fit get_C_index
        out = np.array([[model.risk_eval(inputs)] for inputs in data])
        return get_C_index(test_targets, out)

    #First establish baseline c-index
    base_cindex = c_index_for(test_data)

    #Now we can calculate any changes. Do so now for each variable
    #TODO: make sure they are ordered correctly
    variable_changes = {}
    #items() works on both Python 2 and 3, iteritems() only on Python 2
    for var, i in column_map.items():
        print("Checking {}, {}".format(i, var))
        #Make a copy of the data set so we can modify the variable
        temp_data = test_data.copy()
        #Overwrite this variable with the constant 1 for all patients
        #NOTE(review): an older comment said "zero" here, confirm that 1 is intended
        temp_data[:, i] = 1
        #Difference to the baseline, scaled by 100
        variable_changes[var] = 100 * (base_cindex - c_index_for(temp_data))

    #All variables completed. Return dictionary
    return variable_changes
def survival_stat(filename, thresholds = None):
    '''
    Print the C-index and its inverse for a comma separated result file and
    draw the Kaplan-Meier plot for it.

    Columns 2 to 10 of the file are parsed (the first row is ignored). Of
    the parsed columns, 2 and 3 are used as targets and the last column
    together with column 3 as outputs. thresholds is passed on to
    kaplanmeier.
    '''
    raw_rows = np.array(read_data_file(filename, ","))
    parsed_columns = (2, 3, 4, 5, 6, 7, 8, 9, 10)
    D, t = parse_data(raw_rows, inputcols = parsed_columns, ignorerows = [0], normalize = False)

    targets = D[:, (2, 3)]
    model_outputs = D[:, (-1, 3)]
    c_index = get_C_index(targets, model_outputs)

    print("C-index: " + str(c_index))
    print("Genetic error: " + str(1 / c_index))

    halving_threshold = kaplanmeier(D, 2, 3, -1, threshold = thresholds)
    print("Threshold dividing the set in two equal pieces: " + str(halving_threshold))

    if plt:
        plt.show()
def experiment(net, P, T, vP, vT, filename, epochs, learning_rate):
    '''
    Train net with traingd using cox_error, log the C-index on the training
    data and draw Kaplan-Meier plots for the training set and, if there is
    one, the validation set.

    Returns the trained network, or the network that was passed in if
    training was aborted by a FloatingPointError.
    '''
    events = T[:, 1]
    logger.info("Running experiment for: " + filename + ' ' + str(epochs) + ", rate: " + str(learning_rate))
    print("Number of patients with events: " + str(events.sum()))
    print("Number of censored patients: " + str((1 - events).sum()))

    #The result is not used below. The call is kept as it was.
    timeslots = generate_timeslots(T)

    training_data = (P, T)
    validation_data = (vP, vT)
    try:
        net = traingd(net, training_data, validation_data, epochs, learning_rate,
                      block_size = 100, error_module = cox_error)
    except FloatingPointError:
        print('Aaawww....')

    train_outputs = net.sim(P)
    logger.info("C index = " + str(get_C_index(T, train_outputs)))

    kaplanmeier(time_array = T[:, 0], event_array = T[:, 1], output_array = train_outputs[:, 0])

    if vP is None or len(vP) <= 0:
        return net

    vald_outputs = net.sim(vP)
    kaplanmeier(time_array = vT[:, 0], event_array = vT[:, 1], output_array = vald_outputs[:, 0])
    return net
def com_cross():
    '''
    Repeated cross-validation of committees trained with the genetic
    algorithm on a fixed data file.

    The number of hidden nodes is read from sys.argv[1] (1 if not given),
    all other settings are fixed below. For every committee the inverse of
    the training and validation C-index is printed.
    '''
    filename = "/home/gibson/jonask/Dropbox/Ann-Survival-Phd/Two_thirds_of_SA_1889_dataset.txt"

    columns = (2, -4, -3, -2, -1)
    print('\nIncluding columns: ' + str(columns))

    P, T = parse_file(filename, targetcols = [4, 5], inputcols = columns, ignorerows = [0], normalize = True)

    print("\nData set:")
    print("Number of patients with events: " + str(T[:, 1].sum()))
    print("Number of censored patients: " + str((1 - T[:, 1]).sum()))

    netsize = int(sys.argv[1]) if len(sys.argv) >= 2 else 1
    print("\nNumber of hidden nodes: " + str(netsize))

    comsize = 4
    print('Number of members in each committee: ' + str(comsize))

    comnum = 5
    print('Number of committees to cross-validate: ' + str(comnum))

    times_to_cross = 3
    print('Number of times to repeat cross-validation: ' + str(times_to_cross))

    pop_size = 100
    print("Population size: " + str(pop_size))

    mutation_rate = 0.05
    print("Mutation rate: " + str(mutation_rate))

    epochs = 100
    print("Epochs: " + str(epochs))

    def committee_c_index(com, inputs, targets):
        #Need double brackets for dimensions to be right for numpy
        outputs = numpy.array([[com.risk_eval(patient)] for patient in inputs])
        return get_C_index(targets, outputs)

    for _repeat in xrange(times_to_cross):
        data_sets = get_cross_validation_sets(P, T, comnum , binary_column = 1)

        print('\nTest Errors, Validation Errors:')

        #zip with xrange so that at most comnum data sets are used
        for _index, (trn_set, vald_set) in zip(xrange(comnum), data_sets):
            com = build_feedforward_committee(comsize, len(P[0]), netsize, 1, output_function = 'linear')

            #1 is the column in the target array which holds the binary censoring information
            test_errors, vald_errors, internal_sets = train_committee(
                com, train_evolutionary, trn_set[0], trn_set[1], 1, epochs,
                error_function = c_index_error, population_size = pop_size, mutation_chance = mutation_rate)

            #first 0 gives training sets, second 0 gives inputs.
            com.set_training_sets([member_sets[0][0] for member_sets in internal_sets])

            train_c_index = committee_c_index(com, trn_set[0], trn_set[1])
            val_c_index = committee_c_index(com, vald_set[0], vald_set[1])

            print(str(1.0 / train_c_index) + ", " + str(1.0 / val_c_index))
def train_model(filename, columns, targets, separator = '\t', comsize=1):
    '''
    train_model(filename, columns, targets, separator, comsize)

    Trains a committee of cox models on the data in filename, prints the
    C-index of the averaged validation output and pickles the committee to
    '.cox_<time>.pcom', where <time> is time.time() without decimals.

    columns are the input columns and targets is [time column, event column].
    The file is parsed with use_header = True and without normalization.
    comsize is the wanted committee size. Members are created in rounds
    based on three-fold cross-validation, so the final committee size can
    differ from comsize when comsize is larger than three and not a
    multiple of three.

    Returns the name of the saved file. Raises ValueError if no committee
    could be created.
    '''
    headers = list(columns)
    headers.extend(targets) #Add targets to the end

    targetcol = targets[0]
    eventcol = targets[1]

    savefile = ".cox_{time:.0f}.pcom".format(time = time.time())

    print('\nIncluding columns: ' + str(columns))
    print('Target columns: ' + str(targets))

    P, T = parse_file(filename, targetcols = targets, inputcols = columns, normalize = False,
                      separator = separator, use_header = True)

    print("\nData set:")
    print("Number of patients with events: " + str(T[:, 1].sum()))
    print("Number of censored patients: " + str((1 - T[:, 1]).sum()))

    print('Number of members in the committee: ' + str(comsize))

    allpats = P.copy()
    allpats_targets = T
    #One list of validation outputs per patient
    patvals = [[] for _pat in allpats]

    cox_committee = None

    #A single piece means that all data is used for training, no blind test set
    super_set = get_cross_validation_sets(P, T, 1, binary_column = 1)

    #Modulo expressions mean we can deal with any number of committees, not only multiples of three
    _res = 1 if comsize == 1 else 0
    rounds = int(comsize / 3) + int((comsize % 3) / 2) + _res
    if int(comsize / 3) > 0:
        _max = 3
    else:
        _max = int((comsize % 3) / 2) * 2 + _res

    #For every blind test group
    for (TRN, _test) in super_set:
        TRN_INPUTS = TRN[0]
        TRN_TARGETS = TRN[1]

        for _round in xrange(rounds):
            #Every time in the loop, create new validation sets of size 1/3
            _tmp_val_sets = get_cross_validation_sets(TRN_INPUTS, TRN_TARGETS, 3, binary_column = 1)
            val_sets = []
            for ((trn_in, trn_tar), (val_in, val_tar)) in _tmp_val_sets[:_max]:
                #Add target columns to the end
                _trn = np.append(trn_in, trn_tar, axis = 1)
                _val = np.append(val_in, val_tar, axis = 1)
                val_sets.append((_trn, _val))

            #And create the cox models, one for each validation set
            tmp_com = committee(val_sets, targetcol, eventcol, headers)
            print("Adding this many members: " + str(len(tmp_com)))
            if cox_committee is None:
                cox_committee = tmp_com
            else:
                #Extend the big committee
                cox_committee.members.extend(tmp_com.members)

    if cox_committee is None:
        #Happens for comsize < 1 or if no cross validation set was returned
        raise ValueError("No committee members were created, comsize = " + str(comsize))

    #Now what we'd like to do is get the value for each patient in the
    #validation set, for all validation sets. Then I'd like to average the
    #result for each such patient, over the different validation sets.
    print("Validating cox committee, this might take a little while...")

    #With fewer than three members, all patients won't be in a validation
    #set. The targets are then collected as the patients are found.
    collect_targets = len(cox_committee) < 3
    if collect_targets:
        allpats_targets = np.empty((0, 2))

    for i, pat in enumerate(allpats):
        if i % 50 == 0:
            print("{0} / {1}".format(i, len(patvals)))
        #We could speed this up by only reading every third dataset, but I'm not sure if they are ordered correctly...
        for cox in cox_committee.members:
            (_trn, _val) = cox.internal_set
            val_in = _val[:, :-2] #Last two columns are targets
            val_tar = _val[:, -2:]
            for valpat, valtar in zip(val_in, val_tar):
                #Checks each variable individually, all() does a boolean and between the results
                if (pat == valpat).all():
                    patvals[i].append(cox_committee.risk_eval(pat, cox = cox))
                    if collect_targets:
                        allpats_targets = np.append(allpats_targets, [valtar], axis = 0)
                    break #Done with this data_set

    #Need double brackets for dimensions to fit C-module. Patients that were
    #never in a validation set are left out.
    avg_vals = np.array([[np.mean(patval)] for patval in patvals if len(patval) > 0])

    #Now we have average validation ranks. do C-index on this
    avg_val_c_index = get_C_index(allpats_targets, avg_vals)
    print('Average validation C-Index: {0}'.format(avg_val_c_index))

    print('Saving committee in {0}'.format(savefile))
    #Pickles are written in binary mode
    with open(savefile, 'wb') as FILE:
        pickle.dump(cox_committee, FILE)

    return savefile
def main(design, **train_kwargs):
    '''
    Train a single multilayered network with the genetic algorithm on the
    generated test data and print the resulting C-index on the training data.

    design holds the sizes of the hidden layers followed by the name of the
    hidden activation function as its last element. train_kwargs are passed
    on to train_evolutionary.
    '''
    #Generated (fake) data
    filename = "/home/gibson/jonask/Projects/DataMaker/hard_survival_test_noisyindata.txt"
    filename_val = "/home/gibson/jonask/Projects/DataMaker/hard_survival_test_val_noisyindata.txt"
    columns = ('X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9')
    targets = ['censnoisytime', 'event']

    P, T = parse_file(filename, targetcols = targets, inputcols = columns, normalize = True,
                      separator = '\t', use_header = True)
    Pval, Tval = parse_file(filename_val, targetcols = targets, inputcols = columns, normalize = True,
                            separator = '\t', use_header = True)

    print('\nIncluding columns: ' + str(columns))
    print('Target columns: ' + str(targets))

    print("\nData set:")
    print("Number of patients with events: " + str(T[:, 1].sum()))
    print("Number of censored patients: " + str((1 - T[:, 1]).sum()))

    for key, value in train_kwargs.iteritems():
        print(str(key) + ": " + str(value))

    errorfunc = c_index_error
    print("\nError function: " + errorfunc.__name__)

    print("\nDesign: " + str(design))
    #Last element is the activation function, the others are layer sizes
    hidden_func = design[-1]
    layers = list(design[:-1])

    net = build_feedforward_multilayered(input_number = len(P[0]), hidden_numbers = layers, output_number = 1,
                                         hidden_function = hidden_func, output_function = "linear")

    best_net = train_evolutionary(net, (P, T), (Pval, Tval), binary_target = 1,
                                  error_function = errorfunc, **train_kwargs)

    #The sorted training outputs are stored on the network before
    #risk_eval is called for each patient
    results = best_net.sim(P)
    best_net.trn_set = results[:, 0] #To get rid of extra dimensions
    best_net.trn_set = numpy.sort(best_net.trn_set)

    #Need double brackets for the dimensions to fit get_C_index
    cens_output = numpy.array([[risk_eval(best_net, pat)] for pat in P])

    #Calc C-index
    c_index = get_C_index(T, cens_output)
    print("C-Index: {0}".format(c_index))
def scatterplot_files(targetfile, targetcol, eventcol, modelfile, modeloutputcol, **kwargs):
    '''
    scatterplot_files(targetfile, targetcol, eventcol, modelfile, modeloutputcol)

    Scatter plots of targets against model output. Two files are taken
    because the target data and the model data are allowed to be in
    different (comma separated) files. Events are ONLY taken from the
    target file.

    The C-index between the files is printed and two files are written,
    where <cindex> is the C-index and the file names are base names without
    extension:
    scatter_cens_cind_<cindex>_<modelfile>_<targetfile>.eps
    scatter_nocens_<cindex>_<modelfile>_<targetfile>.eps

    kwargs is accepted but not used.
    '''
    #Calculate suitable size for the figure for use in LaTEX
    fig_width_pt = 396.0  # Get this from LaTeX using \showthe\columnwidth
    inches_per_pt = 1.0 / 72.27               # Convert pt to inch
    golden_mean = (sqrt(5) - 1.0) / 2.0      # Aesthetic ratio
    fig_width = fig_width_pt * inches_per_pt  # width in inches
    fig_height = fig_width * golden_mean      # height in inches
    plt.rcParams['figure.figsize'] = [fig_width, fig_height]

    #Targets and events, first row is ignored
    target_rows = np.array(read_data_file(targetfile, ","))
    T, t = parse_data(target_rows, inputcols=(targetcol, eventcol), ignorerows=[0], normalize=False)
    X = T[:, 0]
    events = T[:, 1]

    #Model output, first row is ignored
    model_rows = np.array(read_data_file(modelfile, ","))
    D, t = parse_data(model_rows, inputcols=[modeloutputcol], ignorerows=[0], normalize=False)
    Y = D[:, 0]

    #Model output and events side by side
    outputs = np.empty((len(X), 2), dtype='float')
    outputs[:, 0] = Y
    outputs[:, 1] = events
    c_index = get_C_index(T, outputs)

    print("C-Index between these files is: {0}".format(c_index))

    model_name = os.path.splitext(os.path.basename(modelfile))[0]
    target_name = os.path.splitext(os.path.basename(targetfile))[0]

    #First plot includes the events
    scatter(X, Y, events=events, x_label='Targets', y_label='Model output',
            gridsize=30, mincnt=0, show_plot=False)
    plt.savefig('scatter_cens_cind_{cindex}_{0}_{1}.eps'.format(model_name, target_name, cindex=c_index))

    #Second plot is made without the events
    scatter(X, Y, x_label='Targets', y_label='Model output',
            gridsize=30, mincnt=0, show_plot=False)
    plt.savefig('scatter_nocens_{cindex}_{0}_{1}.eps'.format(model_name, target_name, cindex=c_index))
def model_contest(filename, columns, targets, designs, comsize_third = 5, repeat_times = 20, testfilename = None, separator = '\t', **train_kwargs):
    '''
    model_contest(filename, columns, targets, designs)

    Trains one committee per design and repetition through the job server,
    and reports the design with the best committee validation C-index for
    each repetition.

    You must use column names! Here are example values for the input arguments:
        filename = "/home/gibson/jonask/Dropbox/Ann-Survival-Phd/Two_thirds_of_the_n4369_dataset_with_logs_lymf.txt"
        columns = ('age', 'log(1+lymfmet)', 'n_pos', 'tumsize', 'log(1+er_cyt)', 'log(1+pgr_cyt)', 'pgr_cyt_pos',
                   'er_cyt_pos', 'size_gt_20', 'er_cyt_pos', 'pgr_cyt_pos')
        targets = ['time', 'event']

    designs is a sequence of (number of hidden nodes, hidden function)
    pairs. Each committee gets 3 * comsize_third members and the whole
    procedure is repeated repeat_times times. testfilename is an optional
    external test file, parsed the same way as filename. train_kwargs are
    passed on to the training; population_size, mutation_chance and epochs
    get default values if they are not given.

    Writes the results to '.winningdesigns_time.csv' and returns the filename
    '''
    starting_time = time.time()
    #Time it took to get the first result back, used as base for the timeout
    fastest_done = None
    m = Master()

    #m.connect('gibson.thep.lu.se', 'science')
    m.connect('130.235.189.249', 'science')
    print('Connected to server')
    m.clear_queues()

    print('\nIncluding columns: ' + str(columns))
    print('\nTarget columns: ' + str(targets))

    P, T = parse_file(filename, targetcols = targets, inputcols = columns, normalize = True, separator = separator, use_header = True)

    if testfilename is not None:
        Ptest, Ttest = parse_file(testfilename, targetcols = targets, inputcols = columns, normalize = True, separator = separator, use_header = True)
    else:
        Ptest, Ttest = None, None

    print("\nData set:")
    print("Number of patients with events: " + str(T[:, 1].sum()))
    print("Number of censored patients: " + str((1 - T[:, 1]).sum()))
    print("T:" + str(T.shape))
    print("P:" + str(P.shape))
    if (Ptest is not None and Ttest is not None):
        print("\nExternal Test Data set:")
        print("Number of patients with events: " + str(Ttest[:, 1].sum()))
        print("Number of censored patients: " + str((1 - Ttest[:, 1]).sum()))
        print("Ttest:" + str(Ttest.shape))
        print("Ptest:" + str(Ptest.shape))

    comsize = 3 * comsize_third #Make sure it is divisible by three
    print('\nNumber of members in each committee: ' + str(comsize))

    print('Designs used in testing (size, function): ' + str(designs))

    # We can generate a test set from the data set, but usually we don't want that
    # Leave at 1 for no test set.
    # NOTE(review): the validation averaging further down indexes into P and
    # compares with T directly, which presumably only works for val_pieces = 1
    val_pieces = 1
    print('Cross-test pieces: ' + str(val_pieces))

    cross_times = repeat_times
    print('Number of times to repeat procedure: ' + str(cross_times))

    #Default training parameters, only used if the caller did not give them
    if 'population_size' not in train_kwargs:
        train_kwargs['population_size'] = 50

    if 'mutation_chance' not in train_kwargs:
        train_kwargs['mutation_chance'] = 0.25

    if 'epochs' not in train_kwargs:
        train_kwargs['epochs'] = 100

    for k, v in train_kwargs.iteritems():
        print(str(k) + ": " + str(v))

    print('\n Job status:\n')

    #count is both the id of the latest job and the number of outstanding jobs
    count = 0
    all_counts = [] #ids of jobs that have not been processed yet
    all_jobs = {} #job id -> assembled job, kept so a job can be sent again

    tests = {} #job id -> (test inputs, test targets)
    #trn_set = {}
    trn_idx = {} #job id -> indices of the training patients
    #One dictionary per repetition, each with the test group number as key
    all_best = []
    all_best_com_val = []
    all_best_avg_trn = []
    all_best_avg_val = []
    all_best_design = []
    all_best_test = []

    #Lambda times
    for _time in xrange(cross_times):
        #Get an independant test set, 1/tau of the total.
        super_set, super_indices = get_cross_validation_sets(P, T, val_pieces , binary_column = 1, return_indices = True)
        super_zip = zip(super_set, super_indices)

        all_best.append({})
        all_best_com_val.append({})
        all_best_avg_trn.append({})
        all_best_avg_val.append({})
        all_best_design.append({})
        all_best_test.append({})

        best = all_best[_time]
        best_com_val = all_best_com_val[_time]
        best_avg_trn = all_best_avg_trn[_time]
        best_avg_val = all_best_avg_val[_time]
        best_design = all_best_design[_time]
        best_test = all_best_test[_time]

        #For every blind test group
        for (((TRN, TEST), (TRN_IDX, TEST_IDX)), _t) in zip(super_zip, xrange(len(super_set))):
            TRN_INPUTS = TRN[0]
            TRN_TARGETS = TRN[1]
            TEST_INPUTS = TEST[0]
            TEST_TARGETS = TEST[1]

            #run each architecture design on a separate machine
            #Start values, any result with a C-index above 0 replaces them
            best[_t] = None
            best_com_val[_t] = 0
            best_avg_trn[_t] = 0
            best_avg_val[_t] = 0
            best_design[_t] = None
            best_test[_t] = None

            for design in designs:
                count += 1
                all_counts.append(count)

                (netsize, hidden_func) = design

                com = build_feedforward_committee(comsize, len(P[0]), netsize, 1, hidden_function = hidden_func, output_function = 'linear')

                tests[count] = (TEST_INPUTS, TEST_TARGETS)
                #trn_set[count] = (TRN_INPUTS, TRN_TARGETS)
                trn_idx[count] = TRN_IDX

                #The first argument is the job id, it comes back together with the result
                #1 is the column in the target array which holds the binary censoring information
                job = m.assemblejob((count, _time, _t, design), train_committee, com, train_evolutionary, TRN_INPUTS, TRN_TARGETS, binary_target = 1, error_function = c_index_error, **train_kwargs)

                all_jobs[count] = job

                m.sendjob(job[0], job[1], *job[2], **job[3])

    #Collect results until every job has been processed once
    while(count > 0):
        print('Remaining jobs: {0}'.format(all_counts))
        if fastest_done is None:
            ID, RESULT = m.getresult() #Blocks
            fastest_done = time.time() - starting_time
        else:
            #Wait at most twice as long as the first result took
            RETURNVALUE = m.get_waiting_result(2 * fastest_done)
            if RETURNVALUE is not None:
                ID, RESULT = RETURNVALUE
            else:
                #NOTE(review): the message reports fastest_done, but the wait was 2 * fastest_done
                print('Timed out after {0} seconds. Putting remaining jobs {1} back on the queue.\n \
                You should restart the server after this session.'.format(fastest_done, all_counts))
                for _c in all_counts:
                    job = all_jobs[_c]
                    m.sendjob(job[0], job[1], *job[2], **job[3])
                continue #Jump to next iteration

        print('Result received! Processing...')
        _c, _time, _t, design = ID

        (com, trn_errors, vald_errors, internal_sets, internal_sets_indices) = RESULT

        #Jobs that were sent again after a timeout can give more than one result
        if _c not in all_counts:
            print('This result [{0}] has already been processed.'.format(_c))
            continue

        count -= 1

        TEST_INPUTS, TEST_TARGETS = tests[_c]
        #TRN_INPUTS, TRN_TARGETS = trn_set[_c]
        TRN_IDX = trn_idx[_c]

        all_counts.remove(_c)

        com.set_training_sets([_set[0][0] for _set in internal_sets]) #first 0 gives training sets, second 0 gives inputs.

        #Now what we'd like to do is get the value for each patient in the
        #validation set, for all validation sets. Then I'd like to average the
        #result for each such patient, over the different validation sets.

        allpats = []
        allpats.extend(internal_sets[0][0][0]) #Extend with training inputs
        allpats.extend(internal_sets[0][1][0]) #Extend with validation inputs

        #allpats_targets is built here but not read again in this function
        allpats_targets = []
        allpats_targets.extend(internal_sets[0][0][1]) #training targets
        allpats_targets.extend(internal_sets[0][1][1]) #validation targets
        allpats_targets = numpy.array(allpats_targets)

        #One list of validation outputs per patient
        patvals = [[] for bah in xrange(len(allpats))]

        #1 for the validation set. Was given to the com.nets in the same type of iteration, so order is same
        # Will be order consistent with P and T
        for ((trn_in, trn_tar), (val_in, val_tar)), idx, net in zip(internal_sets, internal_sets_indices, com.nets):
            #_C_ is the position of valpat inside this validation set
            _C_ = -1
            for valpat in val_in:
                _C_ += 1
                #idx[1] holds the validation indices, which point into TRN_IDX, which points into P
                i = TRN_IDX[idx[1][_C_]]
                pat = P[i]
                #Make sure the index lookup found the same patient
                assert((pat == valpat).all())
                patvals[i].append(com.risk_eval(pat, net = net)) #Just to have something to count

        #Need double brackets for dimensions to fit C-module
        avg_vals = numpy.array([[numpy.mean(patval)] for patval in patvals])
        #Now we have average validation ranks. do C-index on this
        avg_val_c_index = get_C_index(T, avg_vals)

        #The ** -1 inverts the errors; the averages are reported below as performance values
        #NOTE(review): numpy.array(dict.values()) relies on values() being a list (Python 2)
        trn_errors = numpy.array(trn_errors.values(), dtype = numpy.float64) ** -1
        vald_errors = numpy.array(vald_errors.values(), dtype = numpy.float64) ** -1
        avg_trn = numpy.mean(trn_errors)
        avg_val = numpy.mean(vald_errors)

        best = all_best[_time]
        best_com_val = all_best_com_val[_time]
        best_avg_trn = all_best_avg_trn[_time]
        best_avg_val = all_best_avg_val[_time]
        best_design = all_best_design[_time]
        best_test = all_best_test[_time]

        #Keep this committee if it is the best so far for its repetition and test group
        if avg_val_c_index > best_com_val[_t]:
            best[_t] = com
            best_com_val[_t] = avg_val_c_index
            best_avg_trn[_t] = avg_trn
            best_avg_val[_t] = avg_val
            best_design[_t] = design
            best_test[_t] = tests[_c]

    print('\nWinning designs')
    winnerfilename = '.winningdesigns_{0:.0f}.csv'.format(time.time())
    with open(winnerfilename, 'w') as F:
        print('Average Training Perf, Average Validation Perf, Average Committee Validation Perf, Test Perf, Design:')
        F.write('Average Training Perf, Average Validation Perf, Average Committee Validation Perf, Test Perf, Design\n')
        for _time in xrange(len(all_best)):
            best = all_best[_time]
            best_com_val = all_best_com_val[_time]
            best_avg_trn = all_best_avg_trn[_time]
            best_avg_val = all_best_avg_val[_time]
            best_design = all_best_design[_time]
            best_test = all_best_test[_time]
            for _t in best.keys():
                #NOTE(review): these are still None if no result beat the start value 0
                TEST_INPUTS, TEST_TARGETS = best_test[_t]
                com = best[_t]

                #Use the internal test set if there is one, otherwise the external test file
                if len(TEST_INPUTS) > 0:
                    #Need double brackets for dimensions to be right for numpy
                    outputs = numpy.array([[com.risk_eval(inputs)] for inputs in TEST_INPUTS])
                    test_c_index = get_C_index(TEST_TARGETS, outputs)
                elif Ptest is not None and Ttest is not None:
                    #Need double brackets for dimensions to be right for numpy
                    outputs = numpy.array([[com.risk_eval(inputs)] for inputs in Ptest])
                    test_c_index = get_C_index(Ttest, outputs)
                else:
                    test_c_index = 0

                print('{trn}, {val}, {com_val}, {test}, {dsn}'.format(trn = best_avg_trn[_t], val = best_avg_val[_t], com_val = best_com_val[_t], test = test_c_index, dsn = best_design[_t]))
                F.write('{trn}, {val}, {com_val}, {test}, {dsn}\n'.format(trn = best_avg_trn[_t], val = best_avg_val[_t], com_val = best_com_val[_t], test = test_c_index, dsn = best_design[_t]))

    return winnerfilename
def committee_test():
    '''
    Interactive test of a committee trained with the genetic algorithm.

    All settings are asked for on the command line; the value shown in the
    prompt is used when the answer raises SyntaxError (presumably an empty
    line given to the Python 2 input(), which evaluates what is typed).
    Prints the C-indices, shows Kaplan-Meier plots and optionally prints
    the committee output for every patient in the file.
    '''
    def ask(prompt, default):
        #Fall back on the default if the answer can not be parsed
        try:
            return input(prompt)
        except SyntaxError:
            return default

    netsize = ask('Number of hidden nodes? [1]: ', 1)
    comsize = ask('Committee size? [1]: ', 1)
    pop_size = ask('Population size? [100]: ', 100)
    mutation_rate = ask('Please input a mutation rate (0.05): ', 0.05)

    filename = "/home/gibson/jonask/Dropbox/Ann-Survival-Phd/Two_thirds_of_SA_1889_dataset.txt"

    columns = ask("Which columns to include? (Do NOT forget trailing comma if only one column is used, e.g. '3,'\nAvailable columns are: 2, -4, -3, -2, -1. Just press ENTER for all columns.\n",
                  (2, -4, -3, -2, -1))

    P, T = parse_file(filename, targetcols = [4, 5], inputcols = columns, ignorerows = [0], normalize = True)

    #remove tail censored
    cutoff = ask('Cutoff for censored data? [9999 years]: ', 9999)
    P, T = copy_without_censored(P, T, cutoff)

    #Divide into validation sets
    #Any failure at all, in reading or in the conversion, gives the default
    try:
        test_size = float(input('Size of test set (not used in training)? Input in fractions. Default is [0.0]: '))
    except:
        test_size = 0.0
    ((TP, TT), (VP, VT)) = get_validation_set(P, T, validation_size = test_size, binary_column = 1)
    print("Length of training set: " + str(len(TP)))
    print("Length of test set: " + str(len(VP)))

    epochs = ask("\nNumber of generations (1): ", 1)

    com = build_feedforward_committee(comsize, len(P[0]), netsize, 1, output_function = 'linear')

    #1 is the column in the target array which holds the binary censoring information
    test_errors, vald_errors, data_sets = train_committee(
        com, train_evolutionary, P, T, 1, epochs,
        error_function = c_index_error, population_size = pop_size, mutation_chance = mutation_rate)

    #first 0 gives training sets, second 0 gives inputs.
    com.set_training_sets([member_sets[0][0] for member_sets in data_sets])

    print('\nTest C_indices, Validation C_indices:')
    for terr, verr in zip(test_errors.values(), vald_errors.values()):
        print(str(1 / terr) + ", " + str(1 / verr))

    if plt:
        #Need double brackets for dimensions to be right for numpy
        outputs = numpy.array([[com.risk_eval(patient)] for patient in TP])
        kaplanmeier(time_array = TT[:, 0], event_array = TT[:, 1], output_array = outputs[:, 0], threshold = 0.5)
        train_c_index = get_C_index(TT, outputs)
        print("\nC-index on the training set: " + str(train_c_index))
        if len(VP) > 0:
            outputs = numpy.array([[com.risk_eval(patient)] for patient in VP])
            test_c_index = get_C_index(VT, outputs)
            kaplanmeier(time_array = VT[:, 0], event_array = VT[:, 1], output_array = outputs[:, 0], threshold = 0.5)
            print("C-index on the test set: " + str(test_c_index))

        plt.show()

    try:
        answer = input("\nDo you wish to print committee risk output? ['n']: ")
    except (SyntaxError, NameError):
        answer = 'n'

    if answer in ('n', 'no'):
        return

    inputs = read_data_file(filename)
    P, T = parse_file(filename, targetcols = [4, 5], inputcols = columns, ignorerows = [0], normalize = True)
    outputs = [[com.risk_eval(patient)] for patient in P]
    #Pad from the top so that the outputs line up with the rows of the file
    while len(inputs) > len(outputs):
        outputs.insert(0, ["net_output"])

    print("\n")
    for file_cols, output_cols in zip(inputs, outputs):
        #Every column of the file is followed by a comma, then comes the output
        line = ''.join([str(col) + ',' for col in file_cols])
        line += ''.join([str(col) for col in output_cols])
        print(line)
def model_contest(filename, columns, targets, designs, comsize_third=5, repeat_times=20, testfilename=None, separator='\t', **train_kwargs):
    '''
    model_contest(filename, columns, targets, designs)

    Trains one committee per design on remote workers, repeated
    repeat_times times, and records for every repetition the design whose
    averaged committee validation C-index is highest.

    You must use column names! Here are example values for the input arguments:
    filename = "/home/gibson/jonask/Dropbox/Ann-Survival-Phd/Two_thirds_of_the_n4369_dataset_with_logs_lymf.txt"
    columns = ('age', 'log(1+lymfmet)', 'n_pos', 'tumsize', 'log(1+er_cyt)', 'log(1+pgr_cyt)', 'pgr_cyt_pos', 'er_cyt_pos', 'size_gt_20', 'er_cyt_pos', 'pgr_cyt_pos')
    targets = ['time', 'event']

    designs is an iterable of (netsize, hidden_function) pairs.
    comsize_third is multiplied by three to get the committee size.
    testfilename, if given, is parsed with the same columns and used for the
    "Test Perf" column when the internal test split is empty.
    train_kwargs are forwarded to the training job; population_size (50),
    mutation_chance (0.25) and epochs (100) are filled in when missing.

    Writes the results to '.winningdesigns_time.csv' (time is the current
    time in whole seconds) and returns the filename
    '''
    starting_time = time.time()
    fastest_done = None  # Seconds until the first result arrived; later used as timeout base
    m = Master()
    #m.connect('gibson.thep.lu.se', 'science')
    m.connect('130.235.189.249', 'science')
    print('Connected to server')
    m.clear_queues()

    print('\nIncluding columns: ' + str(columns))
    print('\nTarget columns: ' + str(targets))

    P, T = parse_file(filename, targetcols=targets, inputcols=columns, normalize=True, separator=separator, use_header=True)
    if testfilename is not None:
        Ptest, Ttest = parse_file(testfilename, targetcols=targets, inputcols=columns, normalize=True, separator=separator, use_header=True)
    else:
        Ptest, Ttest = None, None

    print("\nData set:")
    # Column 1 of the targets is the binary event indicator
    print("Number of patients with events: " + str(T[:, 1].sum()))
    print("Number of censored patients: " + str((1 - T[:, 1]).sum()))
    print("T:" + str(T.shape))
    print("P:" + str(P.shape))
    if (Ptest is not None and Ttest is not None):
        print("\nExternal Test Data set:")
        print("Number of patients with events: " + str(Ttest[:, 1].sum()))
        print("Number of censored patients: " + str((1 - Ttest[:, 1]).sum()))
        print("Ttest:" + str(Ttest.shape))
        print("Ptest:" + str(Ptest.shape))

    comsize = 3 * comsize_third  #Make sure it is divisible by three
    print('\nNumber of members in each committee: ' + str(comsize))

    print('Designs used in testing (size, function): ' + str(designs))

    # We can generate a test set from the data set, but usually we don't want that
    # Leave at 1 for no test set.
    val_pieces = 1
    print('Cross-test pieces: ' + str(val_pieces))

    cross_times = repeat_times
    print('Number of times to repeat procedure: ' + str(cross_times))

    # Fill in default training parameters the caller did not supply
    if 'population_size' not in train_kwargs:
        train_kwargs['population_size'] = 50

    if 'mutation_chance' not in train_kwargs:
        train_kwargs['mutation_chance'] = 0.25

    if 'epochs' not in train_kwargs:
        train_kwargs['epochs'] = 100

    for k, v in train_kwargs.iteritems():
        print(str(k) + ": " + str(v))

    print('\n Job status:\n')

    count = 0  # Number of jobs still outstanding
    all_counts = []  # IDs of the jobs still outstanding
    all_jobs = {}  # Job ID -> assembled job, kept so that jobs can be re-sent

    tests = {}  # Job ID -> (test inputs, test targets)
    #trn_set = {}
    trn_idx = {}  # Job ID -> indices of the training rows
    # One dict per repetition, each keyed by the test-group number _t
    all_best = []
    all_best_com_val = []
    all_best_avg_trn = []
    all_best_avg_val = []
    all_best_design = []
    all_best_test = []

    #Lambda times
    for _time in xrange(cross_times):
        #Get an independant test set, 1/tau of the total.
        super_set, super_indices = get_cross_validation_sets(
            P, T, val_pieces, binary_column=1, return_indices=True)
        super_zip = zip(super_set, super_indices)

        all_best.append({})
        all_best_com_val.append({})
        all_best_avg_trn.append({})
        all_best_avg_val.append({})
        all_best_design.append({})
        all_best_test.append({})

        best = all_best[_time]
        best_com_val = all_best_com_val[_time]
        best_avg_trn = all_best_avg_trn[_time]
        best_avg_val = all_best_avg_val[_time]
        best_design = all_best_design[_time]
        best_test = all_best_test[_time]

        #For every blind test group
        for (((TRN, TEST), (TRN_IDX, TEST_IDX)), _t) in zip(super_zip, xrange(len(super_set))):
            TRN_INPUTS = TRN[0]
            TRN_TARGETS = TRN[1]
            TEST_INPUTS = TEST[0]
            TEST_TARGETS = TEST[1]

            #run each architecture design on a separate machine
            best[_t] = None
            best_com_val[_t] = 0
            best_avg_trn[_t] = 0
            best_avg_val[_t] = 0
            best_design[_t] = None
            best_test[_t] = None

            for design in designs:
                count += 1
                all_counts.append(count)

                (netsize, hidden_func) = design

                com = build_feedforward_committee(comsize, len(P[0]), netsize, 1, hidden_function=hidden_func, output_function='linear')

                tests[count] = (TEST_INPUTS, TEST_TARGETS)
                #trn_set[count] = (TRN_INPUTS, TRN_TARGETS)
                trn_idx[count] = TRN_IDX

                #1 is the column in the target array which holds the binary censoring information
                # The job ID tuple is handed back with the result and unpacked below
                job = m.assemblejob((count, _time, _t, design), train_committee, com, train_evolutionary, TRN_INPUTS, TRN_TARGETS, binary_target=1, error_function=c_index_error, **train_kwargs)

                all_jobs[count] = job

                m.sendjob(job[0], job[1], *job[2], **job[3])

    # Collect results until every job has been processed once
    while (count > 0):
        print('Remaining jobs: {0}'.format(all_counts))
        if fastest_done is None:
            ID, RESULT = m.getresult()  #Blocks
            fastest_done = time.time() - starting_time
        else:
            # Wait at most twice as long as the first result took
            RETURNVALUE = m.get_waiting_result(2 * fastest_done)
            if RETURNVALUE is not None:
                ID, RESULT = RETURNVALUE
            else:
                # NOTE(review): the backslash continuation sits inside the
                # string literal, so leading whitespace on the next source
                # line becomes part of the printed message.
                print(
                    'Timed out after {0} seconds. Putting remaining jobs {1} back on the queue.\n \
You should restart the server after this session.'.format(
                        fastest_done, all_counts))
                for _c in all_counts:
                    job = all_jobs[_c]
                    m.sendjob(job[0], job[1], *job[2], **job[3])
                continue  #Jump to next iteration

        print('Result received! Processing...')
        # Rebinds _time, _t and design to the values of the job that finished
        _c, _time, _t, design = ID

        (com, trn_errors, vald_errors, internal_sets, internal_sets_indices) = RESULT

        # Re-sent jobs can deliver a result twice; only the first one counts
        if _c not in all_counts:
            print('This result [{0}] has already been processed.'.format(_c))
            continue

        count -= 1

        TEST_INPUTS, TEST_TARGETS = tests[_c]
        #TRN_INPUTS, TRN_TARGETS = trn_set[_c]
        TRN_IDX = trn_idx[_c]

        all_counts.remove(_c)

        com.set_training_sets([
            _set[0][0] for _set in internal_sets
        ])  #first 0 gives training sets, second 0 gives inputs.

        #Now what we'd like to do is get the value for each patient in the
        #validation set, for all validation sets. Then I'd like to average the
        #result for each such patient, over the different validation sets.

        allpats = []
        allpats.extend(internal_sets[0][0][0])  #Extend with training inputs
        allpats.extend(internal_sets[0][1][0])  #Extend with validation inputs

        allpats_targets = []
        allpats_targets.extend(internal_sets[0][0][1])  #training targets
        allpats_targets.extend(internal_sets[0][1][1])  #validation targets
        allpats_targets = numpy.array(allpats_targets)

        # NOTE(review): patvals is sized from the training split but indexed
        # with row numbers of P below, and the C-index uses T rather than
        # allpats_targets. These agree only if the split holds every row of P
        # (presumably the case for val_pieces = 1) -- confirm.
        patvals = [[] for bah in xrange(len(allpats))]

        #1 for the validation set. Was given to the com.nets in the same type of iteration, so order is same
        # Will be order consistent with P and T
        for ((trn_in, trn_tar), (val_in, val_tar)), idx, net in zip(internal_sets, internal_sets_indices, com.nets):
            _C_ = -1  # Position of valpat inside this validation set
            for valpat in val_in:
                _C_ += 1
                # Validation position -> training-row index -> row of P
                i = TRN_IDX[idx[1][_C_]]
                pat = P[i]
                # Sanity check that the index mapping found the same patient
                assert ((pat == valpat).all())
                patvals[i].append(com.risk_eval(pat, net=net))

        #Need double brackets for dimensions to fit C-module
        avg_vals = numpy.array([[numpy.mean(patval)] for patval in patvals])
        #Now we have average validation ranks. do C-index on this
        avg_val_c_index = get_C_index(T, avg_vals)

        # The reported errors are inverted (**-1) before averaging
        trn_errors = numpy.array(trn_errors.values(), dtype=numpy.float64)**-1
        vald_errors = numpy.array(vald_errors.values(), dtype=numpy.float64)**-1
        avg_trn = numpy.mean(trn_errors)
        avg_val = numpy.mean(vald_errors)

        best = all_best[_time]
        best_com_val = all_best_com_val[_time]
        best_avg_trn = all_best_avg_trn[_time]
        best_avg_val = all_best_avg_val[_time]
        best_design = all_best_design[_time]
        best_test = all_best_test[_time]

        # Keep this committee if it beats the best one seen for this group
        if avg_val_c_index > best_com_val[_t]:
            best[_t] = com
            best_com_val[_t] = avg_val_c_index
            best_avg_trn[_t] = avg_trn
            best_avg_val[_t] = avg_val
            best_design[_t] = design
            best_test[_t] = tests[_c]

    print('\nWinning designs')
    winnerfilename = '.winningdesigns_{0:.0f}.csv'.format(time.time())
    with open(winnerfilename, 'w') as F:
        print(
            'Average Training Perf, Average Validation Perf, Average Committee Validation Perf, Test Perf, Design:'
        )
        F.write(
            'Average Training Perf, Average Validation Perf, Average Committee Validation Perf, Test Perf, Design\n'
        )
        for _time in xrange(len(all_best)):
            best = all_best[_time]
            best_com_val = all_best_com_val[_time]
            best_avg_trn = all_best_avg_trn[_time]
            best_avg_val = all_best_avg_val[_time]
            best_design = all_best_design[_time]
            best_test = all_best_test[_time]
            for _t in best.keys():
                # NOTE(review): best_test[_t] is still None if no design
                # scored above 0 for this group; the unpacking would then fail.
                TEST_INPUTS, TEST_TARGETS = best_test[_t]
                com = best[_t]

                if len(TEST_INPUTS) > 0:
                    #Need double brackets for dimensions to be right for numpy
                    outputs = numpy.array([[com.risk_eval(inputs)] for inputs in TEST_INPUTS])
                    test_c_index = get_C_index(TEST_TARGETS, outputs)
                elif Ptest is not None and Ttest is not None:
                    #Need double brackets for dimensions to be right for numpy
                    outputs = numpy.array([[com.risk_eval(inputs)] for inputs in Ptest])
                    test_c_index = get_C_index(Ttest, outputs)
                else:
                    # No test data of either kind is available
                    test_c_index = 0

                print('{trn}, {val}, {com_val}, {test}, {dsn}'.format(
                    trn=best_avg_trn[_t],
                    val=best_avg_val[_t],
                    com_val=best_com_val[_t],
                    test=test_c_index,
                    dsn=best_design[_t]))
                F.write('{trn}, {val}, {com_val}, {test}, {dsn}\n'.format(
                    trn=best_avg_trn[_t],
                    val=best_avg_val[_t],
                    com_val=best_com_val[_t],
                    test=test_c_index,
                    dsn=best_design[_t]))

    return winnerfilename
def train_model(design, filename, columns, targets, comsize_third=20, separator='\t', **train_kwargs):
    '''
    train_model(design, filename, columns, targets)

    Given a design, (netsize, hidden_function), trains a committee of that
    design on the data in filename. The work is sent to remote workers as
    comsize_third jobs, each training a committee of three networks; the
    returned networks are merged into one committee.

    train_kwargs are forwarded to the training job; population_size (200),
    mutation_chance (0.25) and epochs (100) are filled in when missing.

    After every received result the merged committee is pickled (binary
    mode) to '.nodes_function_time.pcom', where nodes and function come
    from the design and time is the current time in whole seconds.

    Returns this filename.
    '''
    starting_time = time.time()
    fastest_done = None  # Seconds until the first result; used as timeout base
    m = Master()
    #m.connect('gibson.thep.lu.se', 'science')
    m.connect('130.235.189.249', 'science')
    print('Connected to server')
    m.clear_queues()

    savefile = ".{nodes}_{a_func}_{time:.0f}.pcom".format(nodes=design[0], a_func=design[1], time=time.time())

    print('\nIncluding columns: ' + str(columns))
    print('Target columns: ' + str(targets))

    P, T = parse_file(filename, targetcols=targets, inputcols=columns, normalize=True, separator=separator, use_header=True)

    print("\nData set:")
    # Column 1 of the targets is the binary event indicator.
    print("Number of patients with events: " + str(T[:, 1].sum()))
    print("Number of censored patients: " + str((1 - T[:, 1]).sum()))

    comsize = 3 * comsize_third  # Divisible by three: 3*X members create X jobs
    print('Number of members in the committee: ' + str(comsize))

    print('Design used (size, function): ' + str(design))

    # Fill in default training parameters the caller did not supply.
    train_kwargs.setdefault('population_size', 200)
    train_kwargs.setdefault('mutation_chance', 0.25)
    train_kwargs.setdefault('epochs', 100)

    for k, v in train_kwargs.iteritems():
        print(str(k) + ": " + str(v))

    #errorfunc = weighted_c_index_error
    errorfunc = c_index_error
    print("\nError function: " + errorfunc.__name__)

    print('\n Job status:\n')

    count = 0  # Number of jobs still outstanding
    all_counts = []  # IDs of the jobs still outstanding
    all_jobs = {}  # Job ID -> assembled job, kept so that jobs can be re-sent
    trn_idx = {}  # Job ID -> indices of the training rows

    master_com = None

    # One list of validation outputs per row of P, in the order of P and T.
    patvals = [[] for _ in xrange(len(P))]

    for _time in xrange(1):
        # A single piece, so no independent test set is split off.
        super_set, super_indices = get_cross_validation_sets(P, T, 1, binary_column=1, return_indices=True)

        for _t, ((TRN, TEST), (TRN_IDX, TEST_IDX)) in enumerate(zip(super_set, super_indices)):
            TRN_INPUTS = TRN[0]
            TRN_TARGETS = TRN[1]

            # Explicit floor division; comsize is a multiple of three.
            for com_num in xrange(comsize // 3):
                count += 1
                all_counts.append(count)
                trn_idx[count] = TRN_IDX

                (netsize, hidden_func) = design
                com = build_feedforward_committee(3, len(P[0]), netsize, 1, hidden_function=hidden_func, output_function='linear')

                # binary_target=1 is the column in the target array which
                # holds the binary censoring information. The ID tuple is
                # handed back together with the result.
                job = m.assemblejob((count, _time, _t, design), train_committee, com, train_evolutionary,
                                    TRN_INPUTS, TRN_TARGETS, binary_target=1, error_function=errorfunc,
                                    **train_kwargs)
                all_jobs[count] = job

                m.sendjob(job[0], job[1], *job[2], **job[3])

    # Receive results until every job has been processed once.
    while count > 0:
        print('Remaining jobs: {0}'.format(all_counts))
        if fastest_done is None:
            ID, RESULT = m.getresult()  # Blocks
            fastest_done = time.time() - starting_time
        else:
            # Wait at most twice as long as the first result took.
            RETURNVALUE = m.get_waiting_result(2 * fastest_done)
            if RETURNVALUE is not None:
                ID, RESULT = RETURNVALUE
            else:
                # Adjacent literals instead of a backslash continuation, so
                # no source indentation ends up inside the message.
                print('Timed out after {0} seconds. Putting remaining jobs {1} back on the queue.\n'
                      'You should restart the server after this session.'.format(fastest_done, all_counts))
                for _c in all_counts:
                    job = all_jobs[_c]
                    m.sendjob(job[0], job[1], *job[2], **job[3])
                continue  # Jump to next iteration

        print('Result received! Processing...')
        _c, _time, _t, design = ID

        (com, trn_errors, vald_errors, internal_sets, internal_sets_indices) = RESULT

        # Re-sent jobs can deliver a result twice; only the first one counts.
        if _c not in all_counts:
            print('This result [{0}] has already been processed.'.format(_c))
            continue

        count -= 1
        TRN_IDX = trn_idx[_c]
        all_counts.remove(_c)

        # First 0 gives training sets, second 0 gives inputs.
        com.set_training_sets([_set[0][0] for _set in internal_sets])

        if master_com is None:
            master_com = com
        else:
            master_com.nets.extend(com.nets)  # Add this batch of networks

        # Evaluate every patient with the network that had it in its
        # validation set; the values are averaged per patient below.
        for ((trn_in, trn_tar), (val_in, val_tar)), idx, net in zip(internal_sets, internal_sets_indices, com.nets):
            for val_pos, valpat in enumerate(val_in):
                # Validation position -> training-row index -> row of P.
                i = TRN_IDX[idx[1][val_pos]]
                pat = P[i]
                # Sanity check that the index mapping found the same patient.
                assert (pat == valpat).all()
                patvals[i].append(com.risk_eval(pat, net=net))

        # Double brackets give the column shape the C-index code expects.
        avg_vals = numpy.array([[numpy.mean(patval)] for patval in patvals])
        avg_val_c_index = get_C_index(T, avg_vals)
        print('Average com-validation C-Index so far      : {0}'.format(avg_val_c_index))

        print('Saving committee so far in {0}'.format(savefile))
        # Pickle data must be written to a file opened in binary mode.
        with open(savefile, 'wb') as FILE:
            pickle.dump(master_com, FILE)

    return savefile
def train_model(filename, columns, targets, separator='\t', comsize=1):
    '''
    train_model(filename, columns, targets)

    Trains a committee of cox models on the data in filename and reports the
    C-index of the averaged validation outputs.

    columns are the input column names and targets is [time column, event
    column]; the data is parsed without normalization.

    comsize is the requested number of members. Members are created in
    rounds of three, one per validation third, so for comsize >= 3 the
    committee ends up with a multiple of three members (4 gives 3, 5 gives
    6); comsize 1 and 2 give exactly that many members.

    The committee is pickled (binary mode) to '.cox_time.pcom' where time is
    the current time in whole seconds.

    Returns this filename. Raises ValueError if comsize is less than 1.
    '''
    if comsize < 1:
        # Without any member there is no committee to validate or save.
        raise ValueError("comsize must be at least 1, got {0}".format(comsize))

    headers = []
    headers.extend(columns)
    headers.extend(targets)  # Add targets to the end

    targetcol = targets[0]
    eventcol = targets[1]

    savefile = ".cox_{time:.0f}.pcom".format(time=time.time())

    print('\nIncluding columns: ' + str(columns))
    print('Target columns: ' + str(targets))

    P, T = parse_file(filename, targetcols=targets, inputcols=columns, normalize=False, separator=separator, use_header=True)

    print("\nData set:")
    # Column 1 of the targets is the binary event indicator.
    print("Number of patients with events: " + str(T[:, 1].sum()))
    print("Number of censored patients: " + str((1 - T[:, 1]).sum()))

    print('Number of members in the committee: ' + str(comsize))

    allpats_targets = T
    # One list of validation outputs per row of P.
    patvals = [[] for _ in xrange(len(P))]

    cox_committee = None

    # A single piece, so no independent test set is split off.
    super_set = get_cross_validation_sets(P, T, 1, binary_column=1)

    # Number of rounds and members per round, see the docstring.
    full_rounds = comsize // 3
    extra_round = (comsize % 3) // 2
    single = 1 if comsize == 1 else 0
    if full_rounds > 0:
        members_per_round = 3
    else:
        members_per_round = extra_round * 2 + single

    for (TRN, TEST) in super_set:
        TRN_INPUTS = TRN[0]
        TRN_TARGETS = TRN[1]

        for com_num in xrange(full_rounds + extra_round + single):
            # Every round draws new validation sets of size 1/3.
            _tmp_val_sets = get_cross_validation_sets(TRN_INPUTS, TRN_TARGETS, 3, binary_column=1)
            val_sets = []
            for ((trn_in, trn_tar), (val_in, val_tar)) in _tmp_val_sets[:members_per_round]:
                # Add target columns to the end.
                _trn = np.append(trn_in, trn_tar, axis=1)
                _val = np.append(val_in, val_tar, axis=1)
                val_sets.append((_trn, _val))

            # One cox model per validation set.
            tmp_com = committee(val_sets, targetcol, eventcol, headers)
            print("Adding this many members: " + str(len(tmp_com)))
            if cox_committee is None:
                cox_committee = tmp_com
            else:
                # Extend the big committee.
                cox_committee.members.extend(tmp_com.members)

    # Get the output for each patient from the members that had it in their
    # validation set, then average per patient.
    print("Validating cox committee, this might take a little while...")
    small_committee = len(cox_committee) < 3
    if small_committee:
        # Not every patient is in a validation set in this case, so the
        # targets are collected for the validated patients only.
        allpats_targets = np.empty((0, 2))
    for i, pat in enumerate(P):
        if i % 50 == 0:
            print("{0} / {1}".format(i, len(patvals)))
        for cox in cox_committee.members:
            (_trn, _val) = cox.internal_set
            val_in = _val[:, :-2]  # Last two columns are targets
            val_tar = _val[:, -2:]
            for valpat, valtar in zip(val_in, val_tar):
                # Compares each variable; all() requires every one to match.
                if (pat == valpat).all():
                    first_match = not patvals[i]
                    patvals[i].append(cox_committee.risk_eval(pat, cox=cox))
                    # One target row per validated patient, so the targets
                    # stay aligned with avg_vals even when the same row is
                    # matched in more than one member.
                    if small_committee and first_match:
                        allpats_targets = np.append(allpats_targets, [valtar], axis=0)
                    break  # Done with this data set

    # Double brackets give the column shape the C-index code expects.
    avg_vals = np.array([[np.mean(patval)] for patval in patvals if len(patval) > 0])
    avg_val_c_index = get_C_index(allpats_targets, avg_vals)
    print('Average validation C-Index: {0}'.format(avg_val_c_index))

    print('Saving committee in {0}'.format(savefile))
    # Pickle data must be written to a file opened in binary mode.
    with open(savefile, 'wb') as FILE:
        pickle.dump(cox_committee, FILE)

    return savefile
def test_model_arrays(savefile, filename, P, T, **kwargs):
    """Evaluate a pickled committee on the inputs P and write its outputs.

    savefile is the pickled committee; filename is only used to build the
    names of the files that are written. kwargs are forwarded to kaplanmeier.

    Without targets (T is None or empty) the outputs are written one per
    line to 'test_<savefile stem>_<filename stem>.cvs' and the output array
    is returned.

    With targets the C-index is printed, a Kaplan-Meier plot is produced and,
    when plt is available, saved as
    'kaplanmeier_<savefile stem>_<filename stem>.eps'. Targets, outputs and
    events are written to the .cvs file and the name of that file is
    returned.
    """
    # NOTE: unpickling executes code stored in the file; only load committee
    # files from a trusted source. Pickle data must be read in binary mode.
    with open(savefile, "rb") as FILE:
        master_com = pickle.load(FILE)

    print("Committee size: {0}".format(len(master_com)))

    model_stem = os.path.splitext(os.path.basename(savefile))[0]
    data_stem = os.path.splitext(os.path.basename(filename))[0]
    output_file = "test_{0}_{1}.cvs".format(model_stem, data_stem)

    # Need double brackets for dimensions to be right for numpy
    outputs = numpy.array([[master_com.risk_eval(inputs)] for inputs in P])

    if T is None or len(T) == 0:
        with open(output_file, "w") as F:
            F.write("Outputs\n")
            for o in outputs:
                F.write("{0}\n".format(o[0]))
        return outputs

    c_index = get_C_index(T, outputs)
    print("C-Index: {0}".format(c_index))

    thresholds = None

    # Calculate suitable size for the figure for use in LaTeX
    fig_width_pt = 396.0  # Get this from LaTeX using \showthe\columnwidth
    inches_per_pt = 1.0 / 72.27  # Convert pt to inch
    golden_mean = (sqrt(5) - 1.0) / 2.0  # Aesthetic ratio
    fig_width = fig_width_pt * inches_per_pt  # width in inches
    fig_height = fig_width * golden_mean  # height in inches
    fig_size = [fig_width, fig_height]
    # Same guard as for savefig below, so a missing plt does not crash here.
    if plt:
        plt.rcParams["figure.figsize"] = fig_size

    kaplanmeier(
        time_array=T[:, 0],
        event_array=T[:, 1],
        output_array=outputs,
        threshold=thresholds,
        show_plot=False,
        bestcut=False,
        **kwargs
    )
    if plt:
        plt.savefig("kaplanmeier_{0}_{1}.eps".format(model_stem, data_stem))

    with open(output_file, "w") as F:
        F.write("Targets,Outputs,Events\n")
        for t, o in zip(T, outputs):
            F.write("{0},{1},{2}\n".format(t[0], o[0], t[1]))

    return output_file
def train_model(design, filename, columns, targets, comsize_third=20, separator='\t', **train_kwargs):
    '''
    train_model(design, filename, columns, targets)

    Given a design, (netsize, hidden_function), trains a committee of that
    design on the data in filename. The work is sent to remote workers as
    comsize_third jobs, each training a committee of three networks; the
    returned networks are merged into one committee.

    train_kwargs are forwarded to the training job; population_size (200),
    mutation_chance (0.25) and epochs (100) are filled in when missing.

    After every received result the merged committee is pickled (binary
    mode) to '.nodes_function_time.pcom', where nodes and function come
    from the design and time is the current time in whole seconds.

    Returns this filename.
    '''
    starting_time = time.time()
    fastest_done = None  # Seconds until the first result; used as timeout base
    m = Master()
    #m.connect('gibson.thep.lu.se', 'science')
    m.connect('130.235.189.249', 'science')
    print('Connected to server')
    m.clear_queues()

    savefile = ".{nodes}_{a_func}_{time:.0f}.pcom".format(nodes=design[0], a_func=design[1], time=time.time())

    print('\nIncluding columns: ' + str(columns))
    print('Target columns: ' + str(targets))

    P, T = parse_file(filename, targetcols=targets, inputcols=columns, normalize=True, separator=separator, use_header=True)

    print("\nData set:")
    # Column 1 of the targets is the binary event indicator.
    print("Number of patients with events: " + str(T[:, 1].sum()))
    print("Number of censored patients: " + str((1 - T[:, 1]).sum()))

    comsize = 3 * comsize_third  # Divisible by three: 3*X members create X jobs
    print('Number of members in the committee: ' + str(comsize))

    print('Design used (size, function): ' + str(design))

    # Fill in default training parameters the caller did not supply.
    train_kwargs.setdefault('population_size', 200)
    train_kwargs.setdefault('mutation_chance', 0.25)
    train_kwargs.setdefault('epochs', 100)

    for k, v in train_kwargs.iteritems():
        print(str(k) + ": " + str(v))

    #errorfunc = weighted_c_index_error
    errorfunc = c_index_error
    print("\nError function: " + errorfunc.__name__)

    print('\n Job status:\n')

    count = 0  # Number of jobs still outstanding
    all_counts = []  # IDs of the jobs still outstanding
    all_jobs = {}  # Job ID -> assembled job, kept so that jobs can be re-sent
    trn_idx = {}  # Job ID -> indices of the training rows

    master_com = None

    # One list of validation outputs per row of P, in the order of P and T.
    patvals = [[] for _ in xrange(len(P))]

    for _time in xrange(1):
        # A single piece, so no independent test set is split off.
        super_set, super_indices = get_cross_validation_sets(P, T, 1, binary_column=1, return_indices=True)

        for _t, ((TRN, TEST), (TRN_IDX, TEST_IDX)) in enumerate(zip(super_set, super_indices)):
            TRN_INPUTS = TRN[0]
            TRN_TARGETS = TRN[1]

            # Explicit floor division; comsize is a multiple of three.
            for com_num in xrange(comsize // 3):
                count += 1
                all_counts.append(count)
                trn_idx[count] = TRN_IDX

                (netsize, hidden_func) = design
                com = build_feedforward_committee(3, len(P[0]), netsize, 1, hidden_function=hidden_func, output_function='linear')

                # binary_target=1 is the column in the target array which
                # holds the binary censoring information. The ID tuple is
                # handed back together with the result.
                job = m.assemblejob((count, _time, _t, design), train_committee, com, train_evolutionary,
                                    TRN_INPUTS, TRN_TARGETS, binary_target=1, error_function=errorfunc,
                                    **train_kwargs)
                all_jobs[count] = job

                m.sendjob(job[0], job[1], *job[2], **job[3])

    # Receive results until every job has been processed once.
    while count > 0:
        print('Remaining jobs: {0}'.format(all_counts))
        if fastest_done is None:
            ID, RESULT = m.getresult()  # Blocks
            fastest_done = time.time() - starting_time
        else:
            # Wait at most twice as long as the first result took.
            RETURNVALUE = m.get_waiting_result(2 * fastest_done)
            if RETURNVALUE is not None:
                ID, RESULT = RETURNVALUE
            else:
                # Adjacent literals instead of a backslash continuation, so
                # no source indentation ends up inside the message.
                print('Timed out after {0} seconds. Putting remaining jobs {1} back on the queue.\n'
                      'You should restart the server after this session.'.format(fastest_done, all_counts))
                for _c in all_counts:
                    job = all_jobs[_c]
                    m.sendjob(job[0], job[1], *job[2], **job[3])
                continue  # Jump to next iteration

        print('Result received! Processing...')
        _c, _time, _t, design = ID

        (com, trn_errors, vald_errors, internal_sets, internal_sets_indices) = RESULT

        # Re-sent jobs can deliver a result twice; only the first one counts.
        if _c not in all_counts:
            print('This result [{0}] has already been processed.'.format(_c))
            continue

        count -= 1
        TRN_IDX = trn_idx[_c]
        all_counts.remove(_c)

        # First 0 gives training sets, second 0 gives inputs.
        com.set_training_sets([_set[0][0] for _set in internal_sets])

        if master_com is None:
            master_com = com
        else:
            master_com.nets.extend(com.nets)  # Add this batch of networks

        # Evaluate every patient with the network that had it in its
        # validation set; the values are averaged per patient below.
        for ((trn_in, trn_tar), (val_in, val_tar)), idx, net in zip(internal_sets, internal_sets_indices, com.nets):
            for val_pos, valpat in enumerate(val_in):
                # Validation position -> training-row index -> row of P.
                i = TRN_IDX[idx[1][val_pos]]
                pat = P[i]
                # Sanity check that the index mapping found the same patient.
                assert (pat == valpat).all()
                patvals[i].append(com.risk_eval(pat, net=net))

        # Double brackets give the column shape the C-index code expects.
        avg_vals = numpy.array([[numpy.mean(patval)] for patval in patvals])
        avg_val_c_index = get_C_index(T, avg_vals)
        print('Average com-validation C-Index so far      : {0}'.format(avg_val_c_index))

        print('Saving committee so far in {0}'.format(savefile))
        # Pickle data must be written to a file opened in binary mode.
        with open(savefile, 'wb') as FILE:
            pickle.dump(master_com, FILE)

    return savefile
def scatterplot_files(targetfile, targetcol, eventcol, modelfile, modeloutputcol, **kwargs):
    '''
    scatterplot_files(targetfile, targetcol, eventcol, modelfile, modeloutputcol)

    Plots model output against targets. The targets and the model output may
    live in different comma separated files; the first row of each file is
    ignored. Events are ONLY taken from the target file.

    Prints the C-index between the two files and writes two figures, the
    first with and the second without event information:
    scatter_cens_cind_<cindex>_<modelfile>_<targetfile>.eps
    scatter_nocens_<cindex>_<modelfile>_<targetfile>.eps
    where the file names are given without directory and extension.
    '''
    def _stem(path):
        # File name without directory and extension
        return os.path.splitext(os.path.basename(path))[0]

    # Figure size suitable for LaTeX: the width is the column width in
    # points (\showthe\columnwidth) converted to inches, and the height
    # follows from the golden ratio.
    points_to_inches = 1.0/72.27
    aesthetic_ratio = (sqrt(5)-1.0)/2.0
    width_inches = 396.0*points_to_inches
    plt.rcParams['figure.figsize'] = [width_inches, width_inches*aesthetic_ratio]

    # Targets and events come from the target file
    target_rows = np.array(read_data_file(targetfile, ","))
    T, _unused = parse_data(target_rows, inputcols = (targetcol, eventcol), ignorerows = [0], normalize = False)
    target_values = T[:, 0]
    event_flags = T[:, 1]

    # The model output comes from the model file
    model_rows = np.array(read_data_file(modelfile, ","))
    model_data, _unused = parse_data(model_rows, inputcols = [modeloutputcol], ignorerows = [0], normalize = False)
    model_values = model_data[:, 0]

    # Pair the model output with the events, same layout as the targets
    paired = np.empty((len(target_values), 2), dtype='float')
    paired[:, 0] = model_values
    paired[:, 1] = event_flags

    c_index = get_C_index(T, paired)
    print("C-Index between these files is: {0}".format(c_index))

    # First figure: events are passed on to the scatter plot
    scatter(target_values, model_values, events = event_flags,
            x_label = 'Targets', y_label = 'Model output',
            gridsize = 30, mincnt = 0, show_plot = False)
    censored_name = 'scatter_cens_cind_{cindex}_{0}_{1}.eps'.format(
        _stem(modelfile), _stem(targetfile), cindex=c_index)
    plt.savefig(censored_name)

    # Second figure: same data without event information
    scatter(target_values, model_values,
            x_label = 'Targets', y_label = 'Model output',
            gridsize = 30, mincnt = 0, show_plot = False)
    uncensored_name = 'scatter_nocens_{cindex}_{0}_{1}.eps'.format(
        _stem(modelfile), _stem(targetfile), cindex=c_index)
    plt.savefig(uncensored_name)