Example #1
import numpy as np
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import StratifiedShuffleSplit, cross_validate
from sklearn.tree import DecisionTreeClassifier


def run(training, cross_v, perc_test, file_output, n_trees=100, nmodels=1,
        max_features=20, cpus=-1):
    base_clf = RandomForestClassifier(n_estimators=n_trees,
                                      class_weight="balanced",
                                      oob_score=True)
    # read_kmer_matrix is provided elsewhere in the project.
    samples, groups, dimensions, values = read_kmer_matrix(training)
    # Encode the group labels as integer class indices.
    classnames, Y = np.unique(groups, return_inverse=True)
    n_dim = values.shape[1]
    # Never ask the selector for more features than the matrix provides.
    if n_dim < max_features:
        max_features = n_dim
    groupcount = np.bincount(Y)
    print("Starting the creation of {} models.".format(nmodels))
    print("Original data have {} dimensions with {} samples, divided in {} groups:".format(n_dim, values.shape[0], len(classnames)))
    for c in range(len(classnames)):
        print("\t{} - {} \t {} samples".format(c, classnames[c], groupcount[c]))
    ofs = open(file_output, "w")
    ofs.write("round\tfeatures\taccuracy\ttrain_accuracy\toob_score\n")
    for i in range(nmodels):
        print("\n\nRound {}".format(i), flush=True)
        # With threshold=-inf, SelectFromModel keeps exactly the top
        # max_features features ranked by the decision tree's importances.
        fsel = SelectFromModel(DecisionTreeClassifier(),
                               max_features=max_features,
                               threshold=-np.inf)
        fsel.fit(values, Y)
        for j in range(2, max_features + 1):
            # Shrink the selector in place so transform() keeps the top j
            # features (the mask is recomputed at transform time in older
            # scikit-learn; newer releases freeze max_features_ at fit).
            fsel.max_features = j
            clf = clone(base_clf)
            tmp_acc = cross_validate(
                clf, fsel.transform(values), Y, scoring="balanced_accuracy",
                cv=StratifiedShuffleSplit(n_splits=cross_v, test_size=perc_test),
                n_jobs=cpus, return_train_score=True)
            acc = np.mean(tmp_acc["test_score"])
            acc_train = np.mean(tmp_acc["train_score"])
            # Refit on the full training set to obtain the out-of-bag score.
            clf.fit(fsel.transform(values), Y)
            toprint = "{}\t{}\t{}\t{}\t{}".format(i, j, acc, acc_train, clf.oob_score_)
            ofs.write(toprint + "\n")
            print(toprint, flush=True)
    ofs.close()
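Both examples call a read_kmer_matrix helper that is not shown on this page; it is assumed to return the tuple (samples, groups, dimensions, values), with one matrix row per sample. A minimal sketch of such a reader, assuming a tab-separated layout whose header names the k-mers and whose rows hold a sample name, a group label, and one count per k-mer; the real helper's file format may well differ:

import numpy as np


def read_kmer_matrix(path):
    # Hypothetical reader; the column order and dtype are assumptions.
    samples, groups, rows = [], [], []
    with open(path) as ifs:
        # Header: sample<TAB>group<TAB>kmer_1<TAB>...<TAB>kmer_n
        dimensions = ifs.readline().rstrip("\n").split("\t")[2:]
        for line in ifs:
            fields = line.rstrip("\n").split("\t")
            samples.append(fields[0])
            groups.append(fields[1])
            rows.append([float(v) for v in fields[2:]])
    # groups as an array supports np.unique and boolean masking;
    # dimensions stays a list so callers can use .index().
    return np.array(samples), np.array(groups), dimensions, np.array(rows)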
Example #2
import sys

import numpy as np
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import balanced_accuracy_score
from sklearn.tree import DecisionTreeClassifier


def run(training,
        test,
        file_output,
        n_trees=100,
        nmodels=1,
        max_features=20,
        cpus=-1):
    base_clf = RandomForestClassifier(n_estimators=n_trees,
                                      class_weight="balanced",
                                      oob_score=True,
                                      n_jobs=cpus)  # fit trees in parallel
    # read_kmer_matrix is provided elsewhere in the project.
    samples, groups, dimensions, values = read_kmer_matrix(training)
    test_samples, test_groups, test_dimensions, test_values = read_kmer_matrix(
        test)
    # Reorder the test columns so they match the training feature order.
    support = []
    for d in dimensions:
        try:
            support.append(test_dimensions.index(d))
        except ValueError:
            print("ERROR! Feature {} is not in the input matrix.".format(d))
            sys.exit(1)
    test_values = test_values[:, support]
    # Encode the training labels, then map the test labels onto the same
    # integer class indices.
    classnames, Y = np.unique(groups, return_inverse=True)
    Y_test = np.zeros(test_groups.shape, dtype=int)
    for i in range(len(classnames)):
        Y_test[test_groups == classnames[i]] = i
    n_dim = values.shape[1]
    groupcount = np.bincount(Y)
    print("Starting the creation of {} models.".format(nmodels))
    print(
        "Original data have {} dimensions with {} samples, divided in {} groups:"
        .format(n_dim, values.shape[0], len(classnames)))
    for c in range(len(classnames)):
        print("\t{} - {} \t {} samples".format(c, classnames[c],
                                               groupcount[c]))
    ofs = open(file_output, "w")
    for i in range(nmodels):
        print("\n\nRound {}".format(i), flush=True)
        # With threshold=-inf, SelectFromModel keeps exactly the top
        # max_features features ranked by the decision tree's importances.
        fsel = SelectFromModel(DecisionTreeClassifier(),
                               max_features=max_features,
                               threshold=-np.inf)
        fsel.fit(values, Y)
        for j in range(2, max_features + 1):
            # Shrink the selector in place so transform() keeps the top j
            # features (see the note in Example #1 about scikit-learn
            # versions), then score on the held-out test matrix.
            fsel.max_features = j
            clf = clone(base_clf)
            clf.fit(fsel.transform(values), Y)
            acc = balanced_accuracy_score(
                Y_test, clf.predict(fsel.transform(test_values)))
            toprint = "{}\t{}\t{}\t{}".format(i, j, acc, clf.oob_score_)
            ofs.write(toprint + "\n")
            print(toprint, flush=True)
    ofs.close()
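The two variants differ only in how accuracy is estimated: Example #1 scores each feature count with stratified shuffle-split cross-validation on the training matrix, while Example #2 refits per feature count and scores an independent test matrix. A hedged invocation sketch, assuming each variant lives in its own module and that the matrix files exist (every name below is a placeholder):

# Hypothetical module and file names, for illustration only.
from cv_search import run as run_cv            # Example #1
from holdout_search import run as run_holdout  # Example #2

# Ten stratified shuffle splits, holding out 20% of the samples each time.
run_cv("train_kmers.tsv", cross_v=10, perc_test=0.2,
       file_output="cv_results.tsv", n_trees=200, nmodels=5)

# Score each feature count on an independent test matrix instead.
run_holdout("train_kmers.tsv", "test_kmers.tsv",
            file_output="holdout_results.tsv", n_trees=200, nmodels=5)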