def run_xgboost_classifier(load=False, model_no=1):
    """Train (or load) an XGBoost classifier on raw-count RNA-Seq data and evaluate it.

    :param load: whether or not to load a pre-trained model instead of training
    :param model_no: if load is True, which saved model to load
    """
    data_RNASeq_labels = load_data_RNASeq(proc=False, label=False, raw_count=True)
    # Peek at a couple of rows for a sanity check.
    print(data_RNASeq_labels.iloc[1:3])
    data_RNASeq_labels = data_RNASeq_labels.drop(columns=['gene'])
    data_labels = data_RNASeq_labels['label']
    data_RNASeq = data_RNASeq_labels.drop(columns=['label'])

    # train/test split
    print("\nsplitting the training/test dataset ...")
    X_train, X_test, y_train, y_test = train_test_split(
        data_RNASeq, data_labels)

    if load:
        print("\nload pre-trained no.%d model" % model_no)
        xgb_model = load_model(model_no)
    else:
        print("\ntraining a XGBoost classifier ...")
        xgb_model = xgb.XGBClassifier(min_child_weight=MIN_CHILD_WEIGHT,
                                      gamma=G,
                                      subsample=SUBSAMPLE,
                                      n_estimators=NO_TREES,
                                      max_depth=MAX_DEPTH)
        # Hyperparameter summary doubles as the saved model's name.
        # FIX: "subample" -> "subsample" so the recorded name matches the
        # actual XGBClassifier parameter.
        xgbt_name = "min_child_weight=%s,gamma=%s,subsample=%s,n_estimators=%s,max_depth=%s" % (
            str(MIN_CHILD_WEIGHT), str(G), str(SUBSAMPLE), str(NO_TREES),
            str(MAX_DEPTH))
        xgb_model.fit(X_train, y_train)
        # FIX: "classifer" -> "classifier" in the progress message.
        print("\ntraining DONE. \n\nsaving the XGBoost classifier ...")
        save_model(xgb_model, xgbt_name)

    print("\ntesting the XGBoost classifier ...")
    y_pred = xgb_model.predict(X_test)
    # NOTE(review): MSE on class labels is unusual for classification —
    # confirm this is intended (accuracy/F1 may be more informative).
    print(mean_squared_error(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

    # show & save the top 50 important features
    show_important_feature(xgb_model, data_RNASeq, img=False)
    # draw the precision recall curve for the classifier
    draw_precision_recall_curve(y_test, y_pred)
def run_random_forest(load=False, model_no=1):
    """Train (or load) a Random Forest classifier on raw-count RNA-Seq data and evaluate it.

    :param load: whether or not to load a pre-trained model instead of training
    :param model_no: if load is True, which saved model to load
    """
    # Load the labelled matrix and show a quick sample of it.
    labelled = load_data_RNASeq(proc=False, label=False, raw_count=True)
    print(labelled.iloc[1:3])
    labelled = labelled.drop(columns=['gene'])
    targets = labelled['label']
    expression = labelled.drop(columns=['label'])

    # train/test split
    print("\nsplitting the training/test dataset ...")
    X_train, X_test, y_train, y_test = train_test_split(
        expression, targets)

    if not load:
        print("\ntraining a Random Forest classifier ...")
        forest = RandomForestClassifier(n_estimators=NO_TREES,
                                        random_state=0,
                                        max_features=MAX_FEATURES,
                                        criterion=CRITERION,
                                        n_jobs=NO_JOBS)
        # The hyperparameter summary doubles as the saved model's name.
        forest_name = "n_estimators=%s,max_features=%s,criterion=%s,n_jobs=%s" % (
            NO_TREES, MAX_FEATURES, CRITERION, NO_JOBS)
        forest.fit(X_train, y_train)
        print("\ntraining DONE.\n\nsaving the RF classifier ...")
        save_model(forest, forest_name)
    else:
        print("\nload pre-trained no.%d model" % model_no)
        forest = load_model(model_no)

    print("\ntesting the Random Forest classifier ...\n")
    y_pred = forest.predict(X_test)
    print("Accuracy on training set: %.3f" % forest.score(X_train, y_train))
    print("Accuracy on test set: %.3f" % forest.score(X_test, y_test))

    # show & save the top 50 important features
    show_important_feature(forest, expression, img=False)
    # draw the precision recall curve for the classifier
    draw_precision_recall_curve(y_test, y_pred)
def run_gradient_boost(load=False, model_no=1):
    """Train (or load) a Gradient Boosting Tree classifier on raw-count RNA-Seq data and evaluate it.

    :param load: whether or not to load a pre-trained model instead of training
    :param model_no: if load is True, which saved model to load
    """
    # Load the labelled matrix; print a small slice as a sanity check.
    frame = load_data_RNASeq(proc=False, label=False, raw_count=True)
    print(frame.iloc[1:3])
    frame = frame.drop(columns=['gene'])
    y_all = frame['label']
    X_all = frame.drop(columns=['label'])

    # train/test split
    print("\nsplitting the training/test dataset ...")
    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y_all)

    if load:
        print("\nload pre-trained no.%d model" % model_no)
        gbrt = load_model(model_no)
    else:
        print("\ntraining a Gradient Boosting Tree classifier ...")
        gbrt = GradientBoostingClassifier(n_estimators=NO_TREES,
                                          random_state=0,
                                          max_features=MAX_FEATURES,
                                          max_depth=MAX_DEPTH,
                                          learning_rate=LEARNING_RATE)
        # Hyperparameter summary string, used as the saved model's name.
        gbrt_name = "n_estimators=%s,max_features=%s,max_depth=%s,learning_rate=%s" % (
            str(NO_TREES), MAX_FEATURES, str(MAX_DEPTH), str(LEARNING_RATE))
        gbrt.fit(X_train, y_train)
        print("\ntraining DONE.\n\nsaving the GB classifier ...")
        save_model(gbrt, gbrt_name)

    print("\ntesting the Gradient Boosting Tree classifier ...\n")
    y_pred = gbrt.predict(X_test)
    print("Accuracy on training set: %.3f" % gbrt.score(X_train, y_train))
    print("Accuracy on test set: %.3f" % gbrt.score(X_test, y_test))

    # show & save the top 50 important features
    show_important_feature(gbrt, X_all, img=False)
    # draw the precision recall curve for the classifier
    draw_precision_recall_curve(y_test, y_pred)
def tune_hyperparameters():
    """Grid-search Gradient Boosting Tree hyperparameters and persist the best ones.

    NOTE(review): this module defines `tune_hyperparameters` more than once;
    only the last definition is bound at import time — consider renaming each
    variant (e.g. tune_hyperparameters_gbrt).
    """
    # Load the data and show a small sample.
    dataset = load_data_RNASeq(proc=False, label=False, raw_count=True)
    print(dataset.iloc[1:3])
    dataset = dataset.drop(columns=['gene'])
    y_all = dataset['label']
    X_all = dataset.drop(columns=['label'])

    # train/test split
    print("\nsplitting the training/test dataset ...")
    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y_all)

    # Search space for the gradient-boosting model.
    search_space = {
        "n_estimators": [200, 400, 500, 600, 800],
        "max_features": ['log2', 'sqrt'],
        "max_depth": [3, 5, 8],
        "learning_rate": [0.05, 0.1, 0.2],
    }

    print(
        "\nrunning the Grid Search for Gradient Boosting Tree classifier ...")
    searcher = GridSearchCV(GradientBoostingClassifier(),
                            search_space,
                            cv=2,
                            n_jobs=NO_JOBS,
                            verbose=10)
    searcher.fit(X_train, y_train)
    print(searcher.score(X_train, y_train))
    print(searcher.best_params_)

    # save tune hyperparameters
    with smart_open("./results/best_params_gradient_boost.txt", 'w',
                    encoding='utf-8') as f:
        f.write(str(searcher.best_params_) + str(searcher.best_score_))
    print("\nbest hyperparameters for GBRT has been written to file.")
def tune_hyperparameters():
    """Grid-search XGBoost hyperparameters and persist the best ones.

    NOTE(review): this module defines `tune_hyperparameters` more than once;
    only the last definition is bound at import time — consider renaming each
    variant (e.g. tune_hyperparameters_xgboost).
    """
    # Load the data and show a small sample.
    dataset = load_data_RNASeq(proc=False, label=False, raw_count=True)
    print(dataset.iloc[1:3])
    dataset = dataset.drop(columns=['gene'])
    y_all = dataset['label']
    X_all = dataset.drop(columns=['label'])

    # train/test split
    print("\nsplitting the training/test dataset ...")
    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y_all)

    # Search space for the XGBoost model.
    search_space = {
        "min_child_weight": [1, 5, 10],
        "gamma": [0.5, 1, 2],
        "subsample": [0.6, 0.8, 1.0],
        "max_depth": [3, 5],
        "n_estimators": [50, 200],
    }

    print("\nrunning the Grid Search for XGBoost classifier ...")
    searcher = GridSearchCV(xgb.XGBClassifier(),
                            search_space,
                            cv=2,
                            n_jobs=NO_JOBS,
                            verbose=10)
    searcher.fit(X_train, y_train)
    print(searcher.best_score_)
    print(searcher.best_estimator_)

    # save tuned hyperparameters
    with smart_open("./results/best_params_xgboost.txt", 'w',
                    encoding='utf-8') as f:
        f.write(str(searcher.best_params_) + str(searcher.best_score_))
    print("\nbest hyperparameters for XGBOOST has been written to file.")
def tune_hyperparameters():
    """Grid-search Random Forest hyperparameters and persist the best ones.

    NOTE(review): this module defines `tune_hyperparameters` more than once;
    only the last definition is bound at import time — consider renaming each
    variant (e.g. tune_hyperparameters_random_forest).
    """
    # Load the data and show a small sample.
    dataset = load_data_RNASeq(proc=False, label=False, raw_count=True)
    print(dataset.iloc[1:3])
    dataset = dataset.drop(columns=['gene'])
    y_all = dataset['label']
    X_all = dataset.drop(columns=['label'])

    # train/test split
    print("\nsplitting the training/test dataset ...")
    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y_all)

    # Search space for the random-forest model.
    search_space = {
        "n_estimators": [100, 200, 300, 400, 500],
        "max_features": [0.1, 0.2, 0.25, 0.3, 0.4, 0.5],
        "criterion": ["entropy"],
    }

    print("\nrunning the Grid Search for Random Forest classifier ...")
    searcher = GridSearchCV(RandomForestClassifier(),
                            search_space,
                            cv=2,
                            n_jobs=NO_JOBS,
                            verbose=10)
    searcher.fit(X_train, y_train)
    print(searcher.score(X_train, y_train))
    print(searcher.best_params_)

    # save tuned hyperparameters
    with smart_open("./results/best_params_random_forest.txt", 'w',
                    encoding='utf-8') as f:
        f.write(str(searcher.best_params_) + str(searcher.best_score_))
    print("\nbest hyperparameters for RF has been written to file.")
def survival_analysis_with_all_RNASeq(model_type):
    """Run a survival analysis for every important gene signature of a trained model.

    :param model_type: which model's important-feature file to read:
                       'rf', 'gbrt' or 'xgbt'. Any other value prints an
                       error and returns without doing anything.
    """
    # Map each supported model type to its important-feature file; this
    # replaces three copy-pasted if/elif branches in the original.
    feature_files = {
        'rf': IMPORTANT_FEATURE_RANDOM_FOREST,
        'gbrt': IMPORTANT_FEATURE_GRADIENT_BOOST,
        'xgbt': IMPORTANT_FEATURE_XGBOOST,
    }
    if model_type not in feature_files:
        print(
            "\nPlease indicate the type of model you have trained to produce important features"
        )
        # FIX: the original fell through here and saved an empty p-value
        # file for an unknown model_type; bail out instead.
        return

    data_RNASeq_labels = load_data_RNASeq()
    data_RNASeq_labels = data_RNASeq_labels.drop(columns=['gene'])

    feature_list = []  # list of gene signatures
    # load most important features (index and name)
    # NOTE(review): assumes columns 2 and 3 of each line hold the feature
    # index and name — confirm against the file written by
    # show_important_feature.
    for line in smart_open(feature_files[model_type], 'r', encoding='utf-8'):
        fields = line.split()
        feature_list.append((fields[2], fields[3]))

    log_p_values = []
    for i in range(len(feature_list)):
        log_p_values.append(
            survival_analysis_with_one_RNASeq(model_type, data_RNASeq_labels,
                                              feature_list, i))
    print(log_p_values)
    save_log_p_values(model_type, log_p_values)