import logging
import sys
from collections import defaultdict
from itertools import combinations
from pathlib import Path

import pandas as pd
from sklearn.feature_selection import RFE, RFECV
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

# get_ML_parameters, set_up_classifier, combine_list_dfs, normalize_df_columns,
# rearrange_for_testing, save_to_json, and CalculatePerformance are project
# helpers assumed to be imported from elsewhere in this package.

# Module-level flag so the model parameters are only logged once per run.
have_written_params_to_file = False


def eval_classifier(method, train_data, train_class, test_data, test_class,
                    LM_params=None, positive_roc_index=1):
    """Train a one-vs-rest classifier and return predictions and ROC AUC."""
    if LM_params is None:
        # resolved at call time so get_ML_parameters() is not run at import
        LM_params = get_ML_parameters()
    global have_written_params_to_file
    if have_written_params_to_file is False:
        logging.info("Run settings for models:")
        logging.info(str(LM_params))
        logging.info("First run method: " + str(method))
        have_written_params_to_file = True

    # set classifier method
    if method == 'svm':
        clf = SVC(random_state=0, probability=True, **LM_params['svm'])
    else:
        clf = set_up_classifier(method, 0, LM_params)
    clf = OneVsRestClassifier(clf)
    clf.fit(train_data, train_class)

    probas_ = clf.predict_proba(test_data)
    preds = clf.predict(test_data)
    fpr, tpr, thresholds = roc_curve(test_class, probas_[:, positive_roc_index],
                                     pos_label=positive_roc_index)
    roc_auc = auc(fpr, tpr)
    # treat a consistently inverted ranking as informative: flip AUC below 0.5
    roc_auc = max(roc_auc, 1 - roc_auc)
    return preds, roc_auc
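# --- Hedged usage sketch (illustration only, not part of the pipeline) -----
# _toy_data and _demo_eval_classifier are hypothetical helpers added to show
# the call signature; they assume get_ML_parameters() provides an 'svm'
# settings dict and that 40 random samples are enough to fit on.
import numpy as np  # only needed for these demo helpers


def _toy_data(n=40, n_feat=5, seed=0):
    # synthetic feature matrix with binary labels, for illustration only
    rng = np.random.RandomState(seed)
    X = pd.DataFrame(rng.rand(n, n_feat),
                     columns=['f%d' % i for i in range(n_feat)])
    y = pd.Series(rng.randint(0, 2, size=n))
    return X, y


def _demo_eval_classifier():
    X, y = _toy_data()
    preds, roc_auc = eval_classifier('svm', X[:30], y[:30], X[30:], y[30:])
    print("demo AUC: %.3f" % roc_auc)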
def run_rfe_classifier(method, train_data, train_class, test_data, CV_=0,
                       fraction_feat_to_keep=0.1, LM_params=None):
    """Fit with recursive feature elimination when CV_ > 1, otherwise fit directly."""
    if LM_params is None:
        LM_params = get_ML_parameters()
    global have_written_params_to_file
    if have_written_params_to_file is False:
        logging.info("Run settings for models:")
        logging.info(str(LM_params))
        have_written_params_to_file = True

    clf = set_up_classifier(method, CV_, LM_params)
    if CV_ < 1 and method != 'svm':
        clf = OneVsRestClassifier(clf)

    # fit and predict based on whether cross-validation is used
    if CV_ > 1:
        step_elim = (1 - fraction_feat_to_keep) / CV_
        # keep at least one feature so RFE does not fail on small matrices
        num_to_keep = max(int(fraction_feat_to_keep * len(list(train_data))), 1)
        rfe = RFE(estimator=clf, step=step_elim, n_features_to_select=num_to_keep)
        rfe.fit(train_data, train_class)
        preds = rfe.predict(test_data)
    else:
        clf.fit(train_data, train_class)
        preds = clf.predict(test_data)
    return preds
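# Minimal sketch of run_rfe_classifier on the same toy data; 'rf' is assumed
# to be a method key that set_up_classifier recognizes.
def _demo_run_rfe_classifier():
    X, y = _toy_data()
    preds = run_rfe_classifier('rf', X[:30], y[:30], X[30:], CV_=3,
                               fraction_feat_to_keep=0.4)
    print("demo predictions:", list(preds))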
def rfe_classifier(method, train_data, train_class, test_data, CV_=3,
                   fraction_feat_to_keep=0.1, LM_params=None):
    """Fit with RFE; return predictions, the selected features, and their count."""
    if LM_params is None:
        LM_params = get_ML_parameters()
    global have_written_params_to_file
    if have_written_params_to_file is False:
        logging.info("Run settings for models:")
        logging.info(str(LM_params))
        have_written_params_to_file = True

    clf = set_up_classifier(method, CV_, LM_params)

    # fit and predict based on whether cross-validation is used
    if CV_ > 1:
        step_elim = (1 - fraction_feat_to_keep) / CV_
        # keep at least one feature so RFE does not fail on small matrices
        num_to_keep = max(int(fraction_feat_to_keep * len(list(train_data))), 1)
        rfe = RFE(estimator=clf, step=step_elim, n_features_to_select=num_to_keep)
        rfe.fit(train_data, train_class)
        preds = rfe.predict(test_data)
        mask = list(rfe.support_)
        features = train_data.columns
        features_selected = [features[i] for i in range(len(mask)) if mask[i]]
    else:
        clf.fit(train_data, train_class)
        preds = clf.predict(test_data)
        # no elimination was run, so every feature counts as selected
        features_selected = list(train_data.columns)
        mask = [True] * len(features_selected)
    return preds, features_selected, sum(mask)
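# Sketch of rfe_classifier, which additionally reports which features survived
# elimination; same assumptions as the demos above.
def _demo_rfe_classifier():
    X, y = _toy_data()
    preds, kept, n_kept = rfe_classifier('rf', X[:30], y[:30], X[30:], CV_=3,
                                         fraction_feat_to_keep=0.4)
    print("kept %d features: %s" % (n_kept, kept))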
def main(work_dir=None, model='rf', set_of_classes=(0, 1, 2, 3)):
    while work_dir is None or Path(work_dir).exists() is False:
        print("Unable to locate directory.")
        work_dir = input("Please enter working directory: ")

    # folder with features split by section
    work_dir = Path(work_dir)
    DATA_DIR = work_dir / 'section_fm'
    GOLD_FILE = work_dir / 'GOLD_multiclass.csv'
    ML_param_file = work_dir / 'data' / 'ML_model_settings' / 'ML_default_settings.json'
    if ML_param_file.exists():
        params = get_ML_parameters(use_default=False, dict_path=ML_param_file)
    else:
        params = get_ML_parameters(use_default=True)

    logging.info("Loading Data from: " + str(DATA_DIR))
    pathlist = Path(DATA_DIR).glob('*.csv')
    fm_by_section = {}
    lionc = []
    sections_written = defaultdict(bool)  # default = False
    for path in pathlist:
        section_name = path.stem
        lionc.append(section_name)
        fm_by_section[section_name] = pd.read_csv(path, index_col=0)
        fm_by_section[section_name].fillna(0, inplace=True)
    if len(lionc) < 1:
        logging.error("No files found at: " + str(DATA_DIR))
        sys.exit(1)

    # load gold standard labels
    gold = pd.read_csv(GOLD_FILE, index_col=0)
    gold.fillna(0, inplace=True)
    tasks = [x for x in gold if x not in ['test', 'train']]

    frac_features_for_running_f1 = 0.01
    # set the following to choose between RFECV and RFE
    run_f1_with_rfecv = True
    logging.info("frac_features_for_running_f1: " + str(frac_features_for_running_f1)
                 + " with CV?: " + str(run_f1_with_rfecv))
    # if run_f1_with_rfecv is False, can try to run without feature elimination
    no_feature_elim = False

    logging.info("list of sections found:")
    logging.info(str(lionc))
    logging.info("model to run: " + str(model))

    rfecv_top_features = {}
    NUM_SECT_TO_COMBINE = len(lionc)  # add all sections
    sect_combinations = combinations(lionc, NUM_SECT_TO_COMBINE)
    for combo in sect_combinations:
        section_list = [fm_by_section[section] for section in combo]
        merged = combine_list_dfs(section_list)
        merged = normalize_df_columns(merged, 0, tf=(lambda x: x ** (1 / 3)))
        train, test, features = rearrange_for_testing(merged, gold)
        p_avg, r_avg, f1_avg, f1_macro_avg = 0, 0, 0, 0
        print("features:", len(features))

        output_label_line = '%s %8s %8s %8s %8s %8s %8s' % (
            "Morbidity Results", "P-micro", "P-macro", "R-micro", "R-macro",
            "F1-micro", "F1-macro")
        logging.info(output_label_line)

        for task in tasks:
            train, test, features = rearrange_for_testing(merged, gold, task,
                                                          set_of_classes)
            # filter features if desired
            features = [f for f in features if len(f) != 2]
            # features = [f for f in features if f[-1] != 'n']
            if run_f1_with_rfecv:
                preds, feat_important, num_feat = rfecv_classifier(
                    model, train_data=train[features], train_class=train[task],
                    test_data=test[features], CV_=3,
                    fraction_feat_to_keep=frac_features_for_running_f1,
                    LM_params=params, save_model=True)
                rfecv_top_features[task] = feat_important
            elif no_feature_elim:
                clf = set_up_classifier(model, 0, LM_params=params)
                clf.fit(train[features], train[task])
                preds = clf.predict(test[features])
            else:
                preds, feat_important, num_feat = rfe_classifier(
                    model, train_data=train[features],
                    train_class=train[task].astype(int),
                    test_data=test[features], CV_=10,
                    fraction_feat_to_keep=frac_features_for_running_f1,
                    LM_params=params)

            results = CalculatePerformance.calculate_metrics(
                list(test[task]), list(preds), set_of_classes,
                output_type='values')
            f1 = results[4]
            f1_macro = results[5]
            f1_avg += f1 / len(tasks)
            f1_macro_avg += f1_macro / len(tasks)
            logging.info("task: " + str(task) + ' ' +
                         CalculatePerformance.calculate_metrics(
                             list(test[task]), list(preds), set_of_classes,
                             output_type='text').strip())

    file_name = work_dir / 'models' / 'top_features.json'
    save_to_json(rfecv_top_features, file_name)
    logging.info("Averages: f1: %.6f, f1_macro: %.6f" % (f1_avg, f1_macro_avg))
def rfecv_classifier(method, train_data, train_class, test_data, CV_=3,
                     fraction_feat_to_keep=0.1, LM_params=None,
                     save_model=False):
    """RFE with cross-validation, falling back to plain RFE when RFECV keeps
    too few features or fails to eliminate enough of them."""
    if LM_params is None:
        LM_params = get_ML_parameters()
    n_orig_features = len(list(train_data))
    max_ratio_diff = 1.2  # tolerate up to 20% more features than requested
    global have_written_params_to_file
    if have_written_params_to_file is False:
        logging.info("Run settings for models:")
        logging.info(str(LM_params))
        have_written_params_to_file = True

    # set classifier method
    clf = set_up_classifier(method, CV_, LM_params)
    target_n_features = max(int(fraction_feat_to_keep * n_orig_features), 1)

    # fit and predict based on whether cross-validation is used
    if CV_ > 1:
        step_elim = (1 - fraction_feat_to_keep) / CV_
        # Recursive feature elimination with cross-validation. CV can fail if
        # the class distribution is too imbalanced to split properly.
        try:
            # shuffle=True is required for random_state to take effect in
            # recent scikit-learn releases
            rfecv = RFECV(estimator=clf, step=step_elim,
                          cv=StratifiedKFold(n_splits=CV_, shuffle=True,
                                             random_state=0),
                          scoring='accuracy')
            rfecv.fit(train_data, train_class)
            preds = rfecv.predict(test_data)
            current_fraction_features = rfecv.n_features_ / n_orig_features
            if current_fraction_features * max_ratio_diff < fraction_feat_to_keep:
                raise ValueError(
                    "Not enough features kept by RFECV; defaulting to RFE")
        except ValueError:
            rfecv = RFE(estimator=clf, step=step_elim,
                        n_features_to_select=target_n_features)
            rfecv.fit(train_data, train_class)
            preds = rfecv.predict(test_data)
        mask = list(rfecv.support_)
        features = train_data.columns
        features_selected = [features[i] for i in range(len(mask)) if mask[i]]

        # sometimes RFECV does not eliminate enough features; if we are more
        # than 20% over the target fraction, run RFE on the survivors
        current_fraction_features = len(features_selected) / n_orig_features
        step_elim = (current_fraction_features - fraction_feat_to_keep) / CV_
        if (current_fraction_features > max_ratio_diff * fraction_feat_to_keep
                and step_elim > 0):
            rfecv = RFE(estimator=clf, step=step_elim,
                        n_features_to_select=target_n_features)
            rfecv.fit(train_data[features_selected], train_class)
            preds = rfecv.predict(test_data[features_selected])
            mask = list(rfecv.support_)
            # index into the surviving features, not the full column set
            features_selected = [features_selected[i]
                                 for i in range(len(mask)) if mask[i]]
    else:
        clf.fit(train_data, train_class)
        preds = clf.predict(test_data)
        # no elimination was run, so every feature counts as selected
        features_selected = list(train_data.columns)
        mask = [True] * len(features_selected)
    return preds, features_selected, sum(mask)
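# Sketch of rfecv_classifier, which tries RFECV first and falls back to plain
# RFE; same assumptions as the demos above. The guard below is a presumed
# entry point for running this module as a script.
def _demo_rfecv_classifier():
    X, y = _toy_data()
    preds, kept, n_kept = rfecv_classifier('rf', X[:30], y[:30], X[30:], CV_=3,
                                           fraction_feat_to_keep=0.4)
    print("RFECV kept %d features" % n_kept)


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    main()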