def naive_bayes_k(k,
                  sequence_origin='DairyDB',
                  primers_origin='DairyDB',
                  taxonomy_level: int = 1,
                  selected_primer: str = 'V4',
                  model_preprocessing='Computing frequency of {}-mer (ATCG) in every sequence',
                  test_size=0.2):
    """Train and evaluate a Gaussian Naive Bayes classifier on k-mer features.

    Loads the preprocessed train/test split via ``ETL_NB_k_mer``, fits a
    ``GaussianNB`` model, and reports the run through ``main_stats_model``.

    :param k: k-mer length used for the frequency features.
    :param sequence_origin: source database of the sequences.
    :param primers_origin: source database of the primers.
    :param taxonomy_level: taxonomy rank used as the prediction target.
    :param selected_primer: primer region (e.g. 'V4') used to slice sequences.
    :param model_preprocessing: description template of the preprocessing step;
        ``{}`` is replaced by ``k``.
    :param test_size: requested test-set proportion (reported to
        ``main_stats_model``; the returned ``test_size`` comes from there).
    :return: ``(test_size, prop_main_class, accuracy)`` as computed by
        ``main_stats_model``.
    """
    model_preprocessing = model_preprocessing.format(k)

    X_train, X_test, y_train, y_test = ETL_NB_k_mer(
        k=k,
        sequence_origin=sequence_origin,
        primers_origin=primers_origin,
        taxonomy_level=taxonomy_level,
        selected_primer=selected_primer)

    classifier = GaussianNB()
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    test_size, prop_main_class, accuracy = main_stats_model(
        y_train=y_train,
        y_test=y_test,
        y_pred=y_pred,
        model_name='Naive Bayes - NB({})'.format(k),
        model_parameters=classifier.get_params(),
        model_preprocessing=model_preprocessing,
        sequence_origin=sequence_origin,
        primers_origin=primers_origin,
        taxonomy_level=taxonomy_level,
        selected_primer=selected_primer,
        test_size=test_size)
    return test_size, prop_main_class, accuracy
def random_forest_k_default(
        k=4,
        sequence_origin='DairyDB',
        primers_origin='DairyDB',
        taxonomy_level: int = 1,
        selected_primer: str = 'V4',
        model_preprocessing='Computing frequency of {}-mer (ATCG) in every sequence',
        test_size=0.2,
        max_depth=None,
        n_estimators=None):
    """Train and evaluate a Random Forest classifier on k-mer features.

    Fetches the preprocessed train/test split via ``ETL_RF_k_mer``, fits a
    ``RandomForestClassifier`` with project defaults, and logs the run through
    ``main_stats_model``.

    :param k: k-mer length used for the frequency features.
    :param sequence_origin: source database of the sequences.
    :param primers_origin: source database of the primers.
    :param taxonomy_level: taxonomy rank used as the prediction target.
    :param selected_primer: primer region (e.g. 'V4') used to slice sequences.
    :param model_preprocessing: description template; ``{}`` replaced by ``k``.
    :param test_size: requested test-set proportion (reported downstream).
    :param max_depth: forest depth; if None, picked per taxonomy level below.
    :param n_estimators: tree count; if None, defaults to 200.
    :return: ``(RF, test_size, prop_main_class, accuracy)`` where ``RF`` is the
        fitted ``RandomForestClassifier``.
    """
    model_preprocessing = model_preprocessing.format(k)

    X_train, X_test, y_train, y_test = ETL_RF_k_mer(
        k=k,
        sequence_origin=sequence_origin,
        primers_origin=primers_origin,
        taxonomy_level=taxonomy_level,
        selected_primer=selected_primer)

    # Resolve depth default: deeper taxonomy ranks get shallower trees to
    # bound memory; the full-sequence/empty-origin combination gets a middle
    # ground; everything else uses 50. (30 for max_depth is not backed-up.)
    if max_depth is None:
        deep_taxonomy = taxonomy_level >= 5
        mid_taxonomy_full_seq = (taxonomy_level >= 3
                                 and selected_primer == 'sequence'
                                 and sequence_origin == '')
        if deep_taxonomy:
            max_depth = 10
        elif mid_taxonomy_full_seq:
            max_depth = 20
        else:
            max_depth = 50

    if n_estimators is None:
        n_estimators = 200

    forest = RandomForestClassifier(
        bootstrap=False,
        min_samples_leaf=1,
        min_samples_split=2,
        # Cap features at 50, or the whole feature space when 4**k is smaller.
        max_features=min(50, 4**k),
        n_estimators=n_estimators,
        max_depth=max_depth,
        n_jobs=-1)
    forest.fit(X_train, y_train)
    y_pred = forest.predict(X_test)

    test_size, prop_main_class, accuracy = main_stats_model(
        y_train=y_train,
        y_test=y_test,
        y_pred=y_pred,
        model_name='RF_{}'.format(k),
        model_parameters=forest.get_params(),
        model_preprocessing=model_preprocessing,
        sequence_origin=sequence_origin,
        primers_origin=primers_origin,
        taxonomy_level=taxonomy_level,
        selected_primer=selected_primer,
        test_size=test_size,
        k=k,
        feature_importances=forest.feature_importances_)
    return forest, test_size, prop_main_class, accuracy
def xgboost_k_grid_search_cv(
        k=4,
        param_grid=None,
        sequence_origin='DairyDB',
        primers_origin='DairyDB',
        taxonomy_level: int = 1,
        selected_primer: str = 'V4',
        model_preprocessing='Computing frequency of {}-mer (ATCG) in every sequence',
        test_size=0.2):
    """Grid-search an XGBoost classifier on k-mer features and evaluate it.

    Loads the preprocessed train/test split via ``ETL_k_mer``, runs a 3-fold
    ``GridSearchCV`` over ``param_grid``, evaluates the best estimator on the
    test set, and logs/saves the run through ``main_stats_model``.

    :param k: k-mer length used for the frequency features.
    :param param_grid: dict of XGBoost hyper-parameter lists to search over.
        Required — there is no default grid for this function.
    :param sequence_origin: source database of the sequences.
    :param primers_origin: source database of the primers.
    :param taxonomy_level: taxonomy rank used as the prediction target.
    :param selected_primer: primer region (e.g. 'V4') used to slice sequences.
    :param model_preprocessing: description template; ``{}`` replaced by ``k``.
    :param test_size: requested test-set proportion (reported downstream).
    :raises ValueError: if ``param_grid`` is None.
    :return: ``(test_size, prop_main_class, accuracy)`` from ``main_stats_model``.
    """
    # Fail fast with a clear message instead of letting GridSearchCV.fit
    # raise an opaque error deep inside the search.
    if param_grid is None:
        raise ValueError(
            'param_grid is required for xgboost_k_grid_search_cv')

    model_preprocessing = model_preprocessing.format(k)

    X_train, X_test, y_train, y_test = ETL_k_mer(
        k=k,
        sequence_origin=sequence_origin,
        primers_origin=primers_origin,
        taxonomy_level=taxonomy_level,
        selected_primer=selected_primer)

    XGB = XGBClassifier()
    grid_search = GridSearchCV(estimator=XGB,
                               param_grid=param_grid,
                               cv=3,
                               n_jobs=8,
                               verbose=2)
    grid_search.fit(X_train, y_train)

    # With the default refit=True, best_estimator_ is already refit on the
    # full training set — no need to fit it again before predicting.
    XGB_opt = grid_search.best_estimator_
    y_pred = XGB_opt.predict(X_test)

    test_size, prop_main_class, accuracy = main_stats_model(
        y_train=y_train,
        y_test=y_test,
        y_pred=y_pred,
        model_name='XGB_CV_{}'.format(k),
        model_parameters=grid_search.best_params_,
        model_preprocessing=model_preprocessing,
        sequence_origin=sequence_origin,
        primers_origin=primers_origin,
        taxonomy_level=taxonomy_level,
        selected_primer=selected_primer,
        test_size=test_size,
        feature_importances=XGB_opt.feature_importances_,
        k=k,
        save_csv=True,
        xgb_model=XGB_opt,
        save_model=True)
    return test_size, prop_main_class, accuracy
def xgboost_k_default(
        k=4,
        sequence_origin='DairyDB',
        primers_origin='DairyDB',
        taxonomy_level: int = 1,
        selected_primer: str = 'V4',
        model_preprocessing='Computing frequency of {}-mer (ATCG) in every sequence',
        test_size=0.2):
    """Train and evaluate an XGBoost classifier with default hyper-parameters.

    (The original docstring said "Random Forest" — copy-paste error; this
    function trains an ``XGBClassifier``.) Loads the preprocessed train/test
    split via ``ETL_k_mer``, fits the model, and logs/saves the run through
    ``main_stats_model``.

    :param k: k-mer length used for the frequency features.
    :param sequence_origin: source database of the sequences.
    :param primers_origin: source database of the primers.
    :param taxonomy_level: taxonomy rank used as the prediction target.
    :param selected_primer: primer region (e.g. 'V4') used to slice sequences.
    :param model_preprocessing: description template; ``{}`` replaced by ``k``.
    :param test_size: requested test-set proportion (reported downstream).
    :return: ``(test_size, prop_main_class, accuracy)`` from ``main_stats_model``.
    """
    model_preprocessing = model_preprocessing.format(k)

    X_train, X_test, y_train, y_test = ETL_k_mer(
        k=k,
        sequence_origin=sequence_origin,
        primers_origin=primers_origin,
        taxonomy_level=taxonomy_level,
        selected_primer=selected_primer)

    # NOTE(review): `silent` is deprecated in recent xgboost releases
    # (replaced by `verbosity`); kept as-is for compatibility with the
    # xgboost version this project pins — confirm before upgrading.
    XGB = XGBClassifier(silent=0, eta=0.3, max_depth=3, n_estimators=100)
    y_pred = XGB.fit(X_train, y_train).predict(X_test)

    test_size, prop_main_class, accuracy = main_stats_model(
        y_train=y_train,
        y_test=y_test,
        y_pred=y_pred,
        model_name='XGB_{}'.format(k),
        model_parameters=XGB.get_params(),
        model_preprocessing=model_preprocessing,
        sequence_origin=sequence_origin,
        primers_origin=primers_origin,
        taxonomy_level=taxonomy_level,
        selected_primer=selected_primer,
        test_size=test_size,
        k=k,
        feature_importances=XGB.feature_importances_,
        xgb_model=XGB,
        save_model=True,
        save_tree=20)

    # Release the large arrays and model eagerly to keep peak memory down
    # when this is called in a loop over k / taxonomy levels.
    del XGB, X_train, X_test, y_train, y_test, y_pred
    return test_size, prop_main_class, accuracy
def random_forest_k_grid_search_cv(
        k=5,
        param_grid=None,
        sequence_origin='DairyDB',
        primers_origin='DairyDB',
        taxonomy_level: int = 1,
        selected_primer: str = 'V4',
        model_preprocessing='Computing frequency of {}-mer (ATCG) in every sequence',
        test_size=0.2):
    """Grid-search a Random Forest classifier on k-mer features and evaluate it.

    Loads the preprocessed train/test split via ``ETL_RF_k_mer``, runs a
    3-fold ``GridSearchCV`` (using a pre-validated default grid when
    ``param_grid`` is None), evaluates the best estimator on the test set,
    and logs/saves the run through ``main_stats_model``.

    Fixes vs. the original:
    - the grid key was misspelled ``'boostrap'``, which made
      ``GridSearchCV.fit`` raise "Invalid parameter" — now ``'bootstrap'``;
    - ``min_samples_leaf`` was built but never added to the grid — now
      included;
    - ``best_estimator_`` was redundantly refit before predicting.

    :param k: k-mer length used for the frequency features.
    :param param_grid: dict of hyper-parameter lists; None uses the defaults.
    :param sequence_origin: source database of the sequences.
    :param primers_origin: source database of the primers.
    :param taxonomy_level: taxonomy rank used as the prediction target.
    :param selected_primer: primer region (e.g. 'V4') used to slice sequences.
    :param model_preprocessing: description template; ``{}`` replaced by ``k``.
    :param test_size: requested test-set proportion (reported downstream).
    :return: ``(test_size, prop_main_class, accuracy)`` from ``main_stats_model``.
    """
    model_preprocessing = model_preprocessing.format(k)

    if param_grid is None:
        # Number of trees in random forest
        n_estimators = [200]  # Checked as often the best option
        # Number of features to consider at every split
        max_features = ['auto']  # Checked as best option
        # Maximum number of levels in tree
        max_depth = [None]  # Checked as best option -> Due to memory errors, limiting at 30
        # Minimum number of samples required to split a node
        min_samples_split = [2]  # Instead of 2, 5, 10 because of unbalanced classes
        # Minimum number of samples required at each leaf node
        min_samples_leaf = [1]  # Instead of 1, 2, 4 because of unbalanced classes
        # Method of selecting samples for training each tree
        bootstrap = [False]  # Checked as best option
        # Create the random grid
        param_grid = {
            'n_estimators': n_estimators,
            'max_features': max_features,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'bootstrap': bootstrap,
        }

    X_train, X_test, y_train, y_test = ETL_RF_k_mer(
        k=k,
        sequence_origin=sequence_origin,
        primers_origin=primers_origin,
        taxonomy_level=taxonomy_level,
        selected_primer=selected_primer)

    RF = RandomForestClassifier()
    grid_search = GridSearchCV(estimator=RF,
                               param_grid=param_grid,
                               cv=3,
                               n_jobs=2,
                               verbose=1)
    grid_search.fit(X_train, y_train)

    # With the default refit=True, best_estimator_ is already refit on the
    # full training set — no need to fit it again before predicting.
    RF_opt = grid_search.best_estimator_
    y_pred = RF_opt.predict(X_test)

    test_size, prop_main_class, accuracy = main_stats_model(
        y_train=y_train,
        y_test=y_test,
        y_pred=y_pred,
        model_name='RF_CV_{}'.format(k),
        model_parameters=grid_search.best_params_,
        model_preprocessing=model_preprocessing,
        sequence_origin=sequence_origin,
        primers_origin=primers_origin,
        taxonomy_level=taxonomy_level,
        selected_primer=selected_primer,
        test_size=test_size,
        feature_importances=RF_opt.feature_importances_,
        save_model=True,
        rf_model=RF_opt,
        k=k,
        save_csv=True)
    return test_size, prop_main_class, accuracy