def train_cv(datapos, dataneg, name, outdir, boruta_arg): len_neg = len(dataneg) auc = 0 cut_at = 0.0 for x in xrange(N_repeats): if len_neg > len(datapos)*(N_repeats-1): print "splitting negatives into %d groups" % N_repeats pocz = int(round(x*len_neg/N_repeats)) kon = int(round((x+1)*len_neg/N_repeats)) data, target = train.join_and_balance(datapos, cut_rows(dataneg, range(pocz,kon))) else: data, target = train.join_and_balance(datapos, dataneg) data, names = my_transpose(data) result = train.do_cross_validation(data, cv_folds, target, "random_forest", N_trees, name, outdir) auc += result[0] cut_at += result[1] print "Mean AUC in %d repeats = %f" % (N_repeats, auc/N_repeats) print "Mean cut value = %f" % (cut_at/N_repeats) sys.stdout.flush() if boruta_arg: boruta.run_boruta(data, target, names, name, outdir) return auc/N_repeats, cut_at/N_repeats
def train_cv(datapos, dataneg, name, outdir, boruta_arg): len_neg = len(dataneg) auc = 0 cut_at = 0.0 for x in xrange(N_repeats): if len_neg > len(datapos) * (N_repeats - 1): print "splitting negatives into %d groups" % N_repeats pocz = int(round(x * len_neg / N_repeats)) kon = int(round((x + 1) * len_neg / N_repeats)) data, target = train.join_and_balance( datapos, cut_rows(dataneg, range(pocz, kon))) else: data, target = train.join_and_balance(datapos, dataneg) data, names = my_transpose(data) result = train.do_cross_validation(data, cv_folds, target, "random_forest", N_trees, name, outdir) auc += result[0] cut_at += result[1] print "Mean AUC in %d repeats = %f" % (N_repeats, auc / N_repeats) print "Mean cut value = %f" % (cut_at / N_repeats) sys.stdout.flush() if boruta_arg: boruta.run_boruta(data, target, names, name, outdir) return auc / N_repeats, cut_at / N_repeats
def train_save(datapos, dataneg, name, outdir):
    """Train a random forest on the balanced data set and save it to outdir.

    Fix: the AUC reported by train.save_class was assigned to a local
    and then discarded; it is now returned, consistent with the other
    training entry points (train_cv, predict) which report their AUC.
    Callers that ignored the previous implicit None are unaffected.
    """
    data, target = train.join_and_balance(datapos, dataneg)
    data, names = my_transpose(data)
    auc = train.save_class(data, target, "random_forest", N_trees,
                           name, outdir)
    sys.stdout.flush()
    return auc
def predict_cv_bothpromoters(pos1, neg1a, neg1b, pos2a, neg2a, pos2b, neg2b):
    """Stratified cross-validated prediction combining two promoter models.

    For each fold, trains one random forest on the first data set and a
    second forest on the second data set (augmented with extra "b"
    samples), then combines the two per-sample predictions via
    combine_predictions.

    Returns (predictions_pos, predictions_neg): per-sample positive-class
    scores for the positive and negative examples, indexed by original
    sample position.

    NOTE(review): the double-bracket indexing ``data1a[[train_index]]``
    presumably relies on numpy fancy indexing to select rows — confirm
    the arrays' layout against join()'s output.
    """
    #pos1 and pos2 are ordered, each sequence pos1[i] corresponds to i-th sequence in pos2[i]
    print pos1.shape[0], pos2a.shape[0], neg1a.shape[0], neg2a.shape[0]
    # use the same number of positives and negatives (smallest set wins)
    len_n = min(pos1.shape[0], pos2a.shape[0], neg1a.shape[0], neg2a.shape[0])
    target = [1]*len_n + [0]*len_n
    data1a, data1b, data2 = join(pos1, neg1a, neg1b, pos2a, neg2a)
    kf = cross_validation.StratifiedKFold(target, cv_folds)
    predictions_pos = [None]*len_n
    predictions_neg = [None]*len_n
    for train_index, test_index in kf:
        data_train = data1a[[train_index]]
        data_test = data1a[[test_index]]
        data_test2 = data1b[[test_index]]
        target_train = [target[index] for index in train_index]
        target_test = [target[index] for index in test_index]
        # class counts in this fold's training split
        npos = sum([1 for i in train_index if target[i] == 1])
        nneg = sum([1 for i in train_index if target[i] == 0])
        print npos, nneg, pos2b.shape[0], neg2b.shape[0]
        if npos > pos2b.shape[0] or nneg > neg2b.shape[0]:
            # not enough "b" samples to match the fold: downsample the
            # training indices to the size of the smaller "b" set
            nsamples = min(pos2b.shape[0], neg2b.shape[0])
            new_train_index = sample(train_index, nsamples)
            data_train2 = data2[[new_train_index]]
            data_train2b, _ = join_and_balance(pos2b[[sample(range(pos2b.shape[0]), nsamples)]], neg2b[[sample(range(neg2b.shape[0]), nsamples)]], balance=False)
            target_train2 = [target[index] for index in new_train_index] +[1]*nsamples+[0]*nsamples
        else:
            # enough "b" samples: keep the full fold and add npos/nneg
            # randomly sampled "b" rows
            data_train2 = data2[[train_index]]
            data_train2b, _ = join_and_balance(pos2b[[sample(range(pos2b.shape[0]), npos)]], neg2b[[sample(range(neg2b.shape[0]), nneg)]], balance=False)
            target_train2 = target_train +[1]*npos+[0]*nneg
        data_train2b = my_transpose(data_train2b)[0]
        data_train2 = numpy.append(data_train2, data_train2b, axis=0)
        predicted = train_random_forest(data_train, target_train, data_test, N_trees)
        predicted2 = train_random_forest(data_train2, target_train2, data_test2, N_trees)
        #print [x[1] for x in predicted]
        #print [x[1] for x in predicted2]
        predicted3 = combine_predictions(predicted, predicted2)
        #predicted3 = predicted
        #one step!
        # scatter combined fold predictions back to original positions;
        # negatives are offset by len_n in the joined index space
        for x in xrange(len(test_index)):
            if target[test_index[x]] == 1:
                predictions_pos[test_index[x]] = predicted3[x][1]
                #print '\t'.join(map(str, ['pos', x, test_index[x], target[test_index[x]], predicted[x,1], predicted2[x,1], predicted3[x][1]]))
            else:
                #print '\t'.join(map(str, ['neg', x, test_index[x], target[test_index[x]], predicted[x,1], predicted2[x,1], predicted3[x][1]]))
                predictions_neg[test_index[x]-len_n] = predicted3[x][1]
    return predictions_pos, predictions_neg
def getclass_bothpromoters(pos2a, neg2a, pos2b, neg2b):
    """Train a random forest on both promoter data sets combined.

    Takes n = min(set sizes) samples from each of the four inputs, so
    the training set holds 2n positives followed by 2n negatives, and
    returns the fitted classifier.
    """
    n = min(pos2a.shape[0], pos2b.shape[0], neg2a.shape[0], neg2b.shape[0])
    target = [1] * n * 2 + [0] * n * 2
    positives, _ = join_and_balance(choose(pos2a, n), choose(pos2b, n),
                                    balance=False)
    negatives, _ = join_and_balance(choose(neg2a, n), choose(neg2b, n),
                                    balance=False)
    combined, _ = join_and_balance(positives, negatives, balance=False)
    features = my_transpose(combined)[0]
    return get_random_forest(features, target, N_trees)
def join(datap1, datan1a, datan1b, datap2, datan2): #returns: datap1+datan1a, datap2+datan1b, datap2+datan2 with n=min(all lengths) samples n = min(datap1.shape[0], datap2.shape[0], datan1a.shape[0], datan1b.shape[0], datan2.shape[0]) pos_i = sample(range(min(datap1.shape[0], datap2.shape[0])), n) neg_i = sample(range(datan1a.shape[0]), n) neg_i2 = sample(range(datan2.shape[0]), n) #sys.stderr.write('%d %d %d %d' %(datap1.shape[0], datap2.shape[0], datan1.shape[0], datan2.shape[0])) data1 = cut_rows(datap1, pos_i) data1 = numpy.append(data1, cut_rows(datan1a, neg_i)) data2p = cut_rows(datap2, pos_i) data2 = numpy.append(data2p, cut_rows(datan2, neg_i2)) data1b = numpy.append(data2p, cut_rows(datan1b, neg_i)) print "data sizes", (data1.shape, data1b.shape, data2.shape ) #target = [1]*len_pos2 + [0]*len_neg2 return my_transpose(data1)[0], my_transpose(data1b)[0], my_transpose(data2)[0]
def predict(datapos, dataneg, class_dir, outdir): try: class_filename = glob(RESULTSPATH+class_dir+'/*')[0] except: assert True, 'No classifier in %s'%class_dir print "Predicting using classifier from ", class_filename classifier = pickle.load(open(class_filename)) data, target = train.join_and_balance(datapos, dataneg, False) data, names = my_transpose(data) predicted = classifier.predict_proba(data) roc = pyroc.ROCData([(target[i], predicted[i, 1],) for i in xrange(0, len(predicted))]) train.save_predictions([(i, predicted[i, 1],target[i]) for i in xrange(0, len(predicted))], outdir) print "AUC=", roc.auc() #plot roc ? return [roc.auc(), 0]
def predict(datapos, dataneg, class_dir, outdir): try: class_filename = glob(RESULTSPATH + class_dir + '/*')[0] except: assert True, 'No classifier in %s' % class_dir print "Predicting using classifier from ", class_filename classifier = pickle.load(open(class_filename)) data, target = train.join_and_balance(datapos, dataneg, False) data, names = my_transpose(data) predicted = classifier.predict_proba(data) roc = pyroc.ROCData([( target[i], predicted[i, 1], ) for i in xrange(0, len(predicted))]) train.save_predictions([(i, predicted[i, 1], target[i]) for i in xrange(0, len(predicted))], outdir) print "AUC=", roc.auc() #plot roc ? return [roc.auc(), 0]
def get_class(datapos, dataneg):
    """Train and return a random forest on the balanced positive/negative data."""
    joined, labels = train.join_and_balance(datapos, dataneg)
    features, _ = my_transpose(joined)
    return train.get_random_forest(features, labels, N_trees)