Exemplo n.º 1
0
def train_cv(datapos, dataneg, name, outdir, boruta_arg):
    """Run N_repeats rounds of cross-validation of a random forest on
    positive/negative sample sets and print/return the mean AUC and
    mean cut-off value.

    datapos/dataneg -- positive and negative sample matrices (row-per-sample;
        exact type depends on cut_rows/my_transpose -- not visible here)
    name, outdir    -- passed through to train.do_cross_validation for output
    boruta_arg      -- if truthy, additionally run Boruta feature selection
    Returns (mean AUC, mean cut value) over the N_repeats rounds.
    """
    len_neg = len(dataneg)
    auc = 0
    cut_at = 0.0
    
    for x in xrange(N_repeats):
        # If there are many more negatives than positives, use a different
        # (disjoint) slice of the negatives in each repeat instead of the
        # whole negative set every time.
        if len_neg > len(datapos)*(N_repeats-1):
            print "splitting negatives into %d groups" % N_repeats
            # NOTE(review): under Python 2, x*len_neg/N_repeats is integer
            # (floor) division, so round() here is effectively a no-op;
            # presumably float division was intended -- confirm.
            pocz = int(round(x*len_neg/N_repeats))
            kon = int(round((x+1)*len_neg/N_repeats))
            data, target = train.join_and_balance(datapos, cut_rows(dataneg, range(pocz,kon)))
        else:
            data, target = train.join_and_balance(datapos, dataneg)
        # my_transpose returns (transposed data, feature names).
        data, names = my_transpose(data)
        result = train.do_cross_validation(data, cv_folds, target, "random_forest", N_trees, name, outdir)
        # result[0] is an AUC, result[1] a cut-off value; accumulate for means.
        auc += result[0]
        cut_at += result[1]
    
    
    
    print "Mean AUC in %d repeats = %f" % (N_repeats, auc/N_repeats)
    print "Mean cut value = %f" % (cut_at/N_repeats)
    
    
    
    sys.stdout.flush()
    if boruta_arg:
        # Uses data/target/names from the LAST loop iteration only.
        boruta.run_boruta(data, target, names, name, outdir)
    return auc/N_repeats, cut_at/N_repeats
Exemplo n.º 2
0
def train_cv(datapos, dataneg, name, outdir, boruta_arg):
    len_neg = len(dataneg)
    auc = 0
    cut_at = 0.0

    for x in xrange(N_repeats):
        if len_neg > len(datapos) * (N_repeats - 1):
            print "splitting negatives into %d groups" % N_repeats
            pocz = int(round(x * len_neg / N_repeats))
            kon = int(round((x + 1) * len_neg / N_repeats))
            data, target = train.join_and_balance(
                datapos, cut_rows(dataneg, range(pocz, kon)))
        else:
            data, target = train.join_and_balance(datapos, dataneg)
        data, names = my_transpose(data)
        result = train.do_cross_validation(data, cv_folds, target,
                                           "random_forest", N_trees, name,
                                           outdir)
        auc += result[0]
        cut_at += result[1]

    print "Mean AUC in %d repeats = %f" % (N_repeats, auc / N_repeats)
    print "Mean cut value = %f" % (cut_at / N_repeats)

    sys.stdout.flush()
    if boruta_arg:
        boruta.run_boruta(data, target, names, name, outdir)
    return auc / N_repeats, cut_at / N_repeats
Exemplo n.º 3
0
def train_save(datapos, dataneg, name, outdir):
    """Train a "random_forest" classifier on balanced pos/neg data and
    persist it via train.save_class (output goes under outdir/name).
    Returns None.
    """
    data, target = train.join_and_balance(datapos, dataneg)
    data, names = my_transpose(data)

    # train.save_class returns a value (an AUC, judging by the sibling
    # train_cv), but nothing here consumed it -- dropped the unused local.
    train.save_class(data, target, "random_forest", N_trees, name, outdir)
    sys.stdout.flush()
Exemplo n.º 4
0
def predict_cv_bothpromoters(pos1, neg1a, neg1b, pos2a, neg2a, pos2b, neg2b):
    """Two-stage cross-validated prediction combining two promoter models.

    Trains one random forest on (pos1, neg1a) data and a second on the
    second-promoter data, combines their per-fold predictions with
    combine_predictions, and returns two lists of combined scores
    (one per positive sample, one per negative sample).
    """
    #pos1 and pos2 are ordered, each sequence pos1[i] corresponds to i-th sequence in pos2[i]
    print pos1.shape[0], pos2a.shape[0], neg1a.shape[0], neg2a.shape[0]
    # Downsample everything to the smallest of the four sets so classes stay balanced.
    len_n = min(pos1.shape[0], pos2a.shape[0], neg1a.shape[0], neg2a.shape[0])
    # First len_n entries are positives (label 1), next len_n negatives (label 0).
    target = [1]*len_n + [0]*len_n
    
    data1a, data1b, data2 = join(pos1, neg1a, neg1b, pos2a, neg2a)
    
    kf = cross_validation.StratifiedKFold(target, cv_folds)
    # Indexed by sample position: positives at [0, len_n), negatives offset by len_n below.
    predictions_pos = [None]*len_n
    predictions_neg = [None]*len_n
    
    
    # NOTE(review): data1a[[train_index]] uses a list-wrapped index array
    # (fancy indexing); presumably equivalent to data1a[train_index] here -- confirm.

    for train_index, test_index in kf:
        data_train = data1a[[train_index]]
        data_test = data1a[[test_index]]
        data_test2 = data1b[[test_index]]
        
        target_train = [target[index] for index in train_index]
        target_test = [target[index] for index in test_index]
        
        # Class counts in this fold's training split.
        npos = sum([1 for i in train_index if target[i] == 1])
        nneg = sum([1 for i in train_index if target[i] == 0])
        print npos, nneg, pos2b.shape[0], neg2b.shape[0]
        
        
        # If the "b" second-promoter sets are too small to match the fold's
        # class counts, subsample the fold down to what they can provide.
        if npos > pos2b.shape[0] or nneg > neg2b.shape[0]:
            nsamples = min(pos2b.shape[0], neg2b.shape[0])
            new_train_index =  sample(train_index, nsamples)
            data_train2 = data2[[new_train_index]]
            data_train2b, _ = join_and_balance(pos2b[[sample(range(pos2b.shape[0]), nsamples)]], neg2b[[sample(range(neg2b.shape[0]), nsamples)]], balance=False)
            target_train2 =  [target[index] for index in new_train_index] +[1]*nsamples+[0]*nsamples
        else:
            data_train2 = data2[[train_index]]
            data_train2b, _ = join_and_balance(pos2b[[sample(range(pos2b.shape[0]), npos)]], neg2b[[sample(range(neg2b.shape[0]), nneg)]], balance=False)
            target_train2 = target_train +[1]*npos+[0]*nneg
        
        # Augment the second model's training set with the extra "b" samples.
        data_train2b = my_transpose(data_train2b)[0]
        data_train2 = numpy.append(data_train2, data_train2b, axis=0)
        
        predicted = train_random_forest(data_train, target_train, data_test, N_trees)
        predicted2 = train_random_forest(data_train2, target_train2, data_test2, N_trees)
        
        #print [x[1] for x in predicted]
        #print [x[1] for x in predicted2]
        predicted3 = combine_predictions(predicted, predicted2)
        #predicted3 = predicted   #one step!
        
        # Scatter combined scores (column 1, presumably P(class==1)) back to
        # each sample's global slot.
        for x in xrange(len(test_index)):
            if target[test_index[x]] == 1:
                predictions_pos[test_index[x]] = predicted3[x][1]
                #print '\t'.join(map(str, ['pos', x, test_index[x], target[test_index[x]], predicted[x,1], predicted2[x,1], predicted3[x][1]]))
            else:
                #print '\t'.join(map(str, ['neg', x, test_index[x], target[test_index[x]], predicted[x,1], predicted2[x,1], predicted3[x][1]]))
                predictions_neg[test_index[x]-len_n] = predicted3[x][1]

    return predictions_pos, predictions_neg
Exemplo n.º 5
0
def getclass_bothpromoters(pos2a, neg2a, pos2b, neg2b):
    """Train a random forest on both promoter variants pooled together.

    Draws n samples (n = smallest of the four sets) from each input,
    stacks positives then negatives, and returns the fitted classifier.
    """
    n = min(pos2a.shape[0], pos2b.shape[0], neg2a.shape[0], neg2b.shape[0])
    # 2n positives followed by 2n negatives.
    labels = [1] * n * 2 + [0] * n * 2

    positives, _ = join_and_balance(choose(pos2a, n), choose(pos2b, n), balance=False)
    negatives, _ = join_and_balance(choose(neg2a, n), choose(neg2b, n), balance=False)
    pooled, _ = join_and_balance(positives, negatives, balance=False)

    features = my_transpose(pooled)[0]
    return get_random_forest(features, labels, N_trees)
Exemplo n.º 6
0
def join(datap1, datan1a, datan1b, datap2, datan2):
    """Build three paired, equally-sized pos+neg datasets from five inputs.

    Samples n = min(all input lengths) rows from each set (positives share
    one index sample so data1/data1b/data2 stay row-aligned on positives)
    and returns the three transposed datasets.
    """
    #returns: datap1+datan1a, datap2+datan1b, datap2+datan2 with n=min(all lengths) samples
    n = min(datap1.shape[0], datap2.shape[0], datan1a.shape[0], datan1b.shape[0], datan2.shape[0])
    
    # One shared positive index sample keeps datap1/datap2 rows corresponding.
    pos_i = sample(range(min(datap1.shape[0], datap2.shape[0])), n)
    neg_i = sample(range(datan1a.shape[0]), n)
    neg_i2 = sample(range(datan2.shape[0]), n)
    
    #sys.stderr.write('%d %d %d %d' %(datap1.shape[0], datap2.shape[0], datan1.shape[0], datan2.shape[0]))
    
    # NOTE(review): numpy.append without axis= flattens both arguments to 1-D;
    # whether that is intended depends on what cut_rows/my_transpose return --
    # confirm (the sibling predict_cv_bothpromoters passes axis=0 explicitly).
    data1 = cut_rows(datap1, pos_i)
    data1 = numpy.append(data1, cut_rows(datan1a, neg_i))
    
    data2p = cut_rows(datap2, pos_i)
    data2 = numpy.append(data2p, cut_rows(datan2, neg_i2))
    
    data1b = numpy.append(data2p, cut_rows(datan1b, neg_i))
    print "data sizes", (data1.shape, data1b.shape, data2.shape )
    #target = [1]*len_pos2 + [0]*len_neg2
    
    
    return my_transpose(data1)[0], my_transpose(data1b)[0], my_transpose(data2)[0]
Exemplo n.º 7
0
def predict(datapos, dataneg, class_dir, outdir):
    
    try:
        class_filename = glob(RESULTSPATH+class_dir+'/*')[0]
    except:
        assert True, 'No classifier in %s'%class_dir
        
    print "Predicting using classifier from ", class_filename
    classifier = pickle.load(open(class_filename))
    
    data, target = train.join_and_balance(datapos, dataneg, False)
    data, names = my_transpose(data)
    predicted = classifier.predict_proba(data)
    roc = pyroc.ROCData([(target[i], predicted[i, 1],) for i in xrange(0, len(predicted))])
    
    train.save_predictions([(i, predicted[i, 1],target[i]) for i in xrange(0, len(predicted))], outdir)
    
    print "AUC=", roc.auc()
    #plot roc ?
    return [roc.auc(), 0]
Exemplo n.º 8
0
def predict(datapos, dataneg, class_dir, outdir):

    try:
        class_filename = glob(RESULTSPATH + class_dir + '/*')[0]
    except:
        assert True, 'No classifier in %s' % class_dir

    print "Predicting using classifier from ", class_filename
    classifier = pickle.load(open(class_filename))

    data, target = train.join_and_balance(datapos, dataneg, False)
    data, names = my_transpose(data)
    predicted = classifier.predict_proba(data)
    roc = pyroc.ROCData([(
        target[i],
        predicted[i, 1],
    ) for i in xrange(0, len(predicted))])

    train.save_predictions([(i, predicted[i, 1], target[i])
                            for i in xrange(0, len(predicted))], outdir)

    print "AUC=", roc.auc()
    #plot roc ?
    return [roc.auc(), 0]
Exemplo n.º 9
0
def train_save(datapos, dataneg, name, outdir):
    """Train a "random_forest" classifier on balanced pos/neg data and
    persist it via train.save_class (output goes under outdir/name).
    Returns None.
    """
    data, target = train.join_and_balance(datapos, dataneg)
    data, names = my_transpose(data)
    
    # save_class's return value (an AUC, per the sibling train_cv) was bound
    # to an unused local -- dropped it.
    train.save_class(data, target, "random_forest", N_trees, name, outdir)
    sys.stdout.flush()
Exemplo n.º 10
0
def get_class(datapos, dataneg):
    """Train and return a random-forest classifier on a balanced
    combination of the positive and negative samples.
    """
    balanced, labels = train.join_and_balance(datapos, dataneg)
    features, _feature_names = my_transpose(balanced)
    return train.get_random_forest(features, labels, N_trees)
Exemplo n.º 11
0
def get_class(datapos, dataneg):
    """Build a balanced pos/neg training set, transpose it into feature
    rows, and return a fitted random-forest classifier.
    """
    joined = train.join_and_balance(datapos, dataneg)
    matrix = my_transpose(joined[0])[0]
    classifier = train.get_random_forest(matrix, joined[1], N_trees)
    return classifier