import sys

import numpy as np
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import cross_validate, StratifiedShuffleSplit
from sklearn.tree import DecisionTreeClassifier


def run(training, cross_v, perc_test, file_output, n_trees=100, nmodels=1,
        max_features=20, cpus=-1):
    """Sweep the number of selected features and report cross-validated
    balanced accuracy of a random forest trained on a k-mer matrix."""
    base_clf = RandomForestClassifier(n_estimators=n_trees,
                                      class_weight="balanced",
                                      oob_score=True)
    # read_kmer_matrix is the project's loader: sample names, group labels,
    # feature (k-mer) names, and the value matrix (samples x features).
    samples, groups, dimensions, values = read_kmer_matrix(training)
    classnames, Y = np.unique(groups, return_inverse=True)
    n_dim = values.shape[1]
    # Never ask the selector for more features than the data provides.
    if n_dim < max_features:
        max_features = n_dim
    groupcount = np.bincount(Y)
    print("Starting the creation of {} models.".format(nmodels))
    print("Original data have {} dimensions with {} samples, "
          "divided in {} groups:".format(n_dim, values.shape[0],
                                         len(classnames)))
    for c in range(0, len(classnames)):
        print("\t{} - {} \t {} samples".format(c, classnames[c],
                                               groupcount[c]))
    ofs = open(file_output, "w")
    ofs.write("round\tfeatures\taccuracy\ttrain_accuracy\toob_score\n")
    for i in range(0, nmodels):
        print("\n\nRound {}".format(i), flush=True)
        # Rank all features once with a decision tree; threshold=-np.inf
        # disables the importance cutoff, so max_features alone controls
        # how many of the top-ranked features are kept.
        fsel = SelectFromModel(DecisionTreeClassifier(),
                               max_features=max_features,
                               threshold=-np.inf)
        fsel.fit(values, Y)
        for j in range(2, max_features + 1):
            # Grow the number of retained features before each transform.
            fsel.max_features = j
            clf = clone(base_clf)
            tmp_acc = cross_validate(
                clf, fsel.transform(values), Y,
                scoring="balanced_accuracy",
                cv=StratifiedShuffleSplit(n_splits=cross_v,
                                          test_size=perc_test),
                n_jobs=cpus, return_train_score=True)
            acc = np.mean(tmp_acc["test_score"])
            acc_train = np.mean(tmp_acc["train_score"])
            # Refit on the full training set to obtain the OOB score.
            clf.fit(fsel.transform(values), Y)
            toprint = "{}\t{}\t{}\t{}\t{}".format(i, j, acc, acc_train,
                                                  clf.oob_score_)
            ofs.write(toprint + "\n")
            print(toprint, flush=True)
    ofs.close()
    return
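# ---------------------------------------------------------------------------
# Minimal sketch of the read_kmer_matrix loader that both run() functions
# rely on. The real helper is defined elsewhere in the project; this version
# assumes a tab-separated layout (header row: sample name and group columns
# followed by k-mer names; one row per sample with its name, its group label,
# then counts). The name read_kmer_matrix_sketch, and the assumed file
# format, are illustrative, not the project's actual implementation.
# ---------------------------------------------------------------------------
def read_kmer_matrix_sketch(path):
    samples, groups, rows = [], [], []
    with open(path) as ifs:
        header = ifs.readline().rstrip("\n").split("\t")
        dimensions = header[2:]  # k-mer (feature) names, kept as a list
        for line in ifs:
            fields = line.rstrip("\n").split("\t")
            samples.append(fields[0])
            groups.append(fields[1])
            rows.append([float(v) for v in fields[2:]])
    # groups as an ndarray so boolean masks like test_groups == name work
    return samples, np.array(groups), dimensions, np.array(rows)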
def run(training, test, file_output, n_trees=100, nmodels=1,
        max_features=20, cpus=-1):
    """Variant of the sweep above that scores each feature count on a
    held-out test matrix instead of using cross-validation."""
    base_clf = RandomForestClassifier(n_estimators=n_trees,
                                      class_weight="balanced",
                                      oob_score=True)
    samples, groups, dimensions, values = read_kmer_matrix(training)
    test_samples, test_groups, test_dimensions, test_values = \
        read_kmer_matrix(test)
    # Reorder the test columns to match the training features, aborting
    # if any training feature is missing from the test matrix.
    support = []
    for d in dimensions:
        try:
            support.append(test_dimensions.index(d))
        except ValueError:
            print("ERROR! Feature {} is not in the input matrix.".format(d))
            sys.exit(1)
    test_values = test_values[:, support]
    classnames, Y = np.unique(groups, return_inverse=True)
    # Encode test labels with the class indices learned on the training set
    # (test groups absent from training silently default to class 0).
    Y_test = np.zeros(test_groups.shape, dtype=int)
    for i in range(0, len(classnames)):
        Y_test[test_groups == classnames[i]] = i
    n_dim = values.shape[1]
    groupcount = np.bincount(Y)
    print("Starting the creation of {} models.".format(nmodels))
    print("Original data have {} dimensions with {} samples, "
          "divided in {} groups:".format(n_dim, values.shape[0],
                                         len(classnames)))
    for c in range(0, len(classnames)):
        print("\t{} - {} \t {} samples".format(c, classnames[c],
                                               groupcount[c]))
    ofs = open(file_output, "w")
    for i in range(0, nmodels):
        print("\n\nRound {}".format(i), flush=True)
        fsel = SelectFromModel(DecisionTreeClassifier(),
                               max_features=max_features,
                               threshold=-np.inf)
        fsel.fit(values, Y)
        for j in range(2, max_features + 1):
            fsel.max_features = j
            clf = clone(base_clf)
            clf.fit(fsel.transform(values), Y)
            acc = balanced_accuracy_score(
                Y_test, clf.predict(fsel.transform(test_values)))
            toprint = "{}\t{}\t{}\t{}".format(i, j, acc, clf.oob_score_)
            ofs.write(toprint + "\n")
            print(toprint, flush=True)
    ofs.close()
    return
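# Example invocation of the held-out evaluation above. Note that when both
# run() definitions live in one module, the second shadows the first, so
# this call uses the (training, test, file_output, ...) signature. The file
# names and parameter values are illustrative assumptions, not project
# defaults.
if __name__ == "__main__":
    run("train_kmers.tsv", "test_kmers.tsv", "test_results.tsv",
        n_trees=200, nmodels=3, max_features=25)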