def get_cascaded_sel_idx(high_th_year, low_th_year, feature_list, set_feature,
                         sel_feature_num, div_ratio=4):
    """Return the column indices a random forest ranks as most important.

    ``x``, ``c``, ``s`` and ``self`` are not parameters; they are resolved
    from the enclosing scope.

    Args:
        high_th_year: High-risk threshold; multiplied by 365 before use
            (presumably a year-to-day conversion).
        low_th_year: Low-risk threshold; multiplied by 365 before use.
        feature_list: Not used by this function.
        set_feature: Column selector applied to the training matrix when it
            is non-empty. The returned indices then refer to positions inside
            that restricted matrix, not to the original columns.
        sel_feature_num: Number of indices to return. When it equals 0 the
            count is derived from ``div_ratio`` instead.
        div_ratio: Divisor applied to the number of columns when
            ``sel_feature_num`` is 0.

    Returns:
        The leading entries of the importance ranking (highest score first).
    """
    day_th_high = high_th_year * 365
    day_th_low = low_th_year * 365
    group_high, group_low = helper.get_risk_group(
        x, c, s, day_th_high, day_th_low)

    # Only a training set is drawn here; no validation split is used.
    train_x, train_y = helper.get_train(
        group_high, group_low, is_categori_y=False, seed=self.random_seed)

    if len(set_feature) > 0:
        train_x = train_x[:, set_feature]

    n_columns = train_x.shape[1]
    n_keep = (int(max(sel_feature_num, n_columns / div_ratio))
              if sel_feature_num == 0 else sel_feature_num)

    # NOTE(review): the forest is created without a random_state, so the
    # ranking may differ between runs even though the sampling is seeded.
    forest = RandomForestClassifier()
    forest.fit(train_x, train_y)

    # argsort is ascending, so reverse it to put the best-scoring column first.
    ranking = np.argsort(forest.feature_importances_)[::-1]
    return ranking[:n_keep]
def get_cascaded_sel_idx(high_th_year, low_th_year, feature_list, set_feature,
                         sel_feature_num, div_ratio=4):
    """Return the leading column indices produced by the trace-ratio selector.

    ``x``, ``c``, ``s`` and ``self`` are not parameters; they are resolved
    from the enclosing scope.

    Args:
        high_th_year: High-risk threshold; multiplied by 365 before use
            (presumably a year-to-day conversion).
        low_th_year: Low-risk threshold; multiplied by 365 before use.
        feature_list: Not used by this function.
        set_feature: Column selector applied to the training matrix when it
            is non-empty. The returned indices then refer to positions inside
            that restricted matrix, not to the original columns.
        sel_feature_num: Number of indices to return. When it equals 0 the
            count is derived from ``div_ratio`` instead.
        div_ratio: Divisor applied to the number of columns when
            ``sel_feature_num`` is 0.

    Returns:
        The first ``sel_feature_num`` (or derived count) entries of the index
        array returned by ``trace_ratio.trace_ratio(..., mode='index')``.
    """
    day_th_high = high_th_year * 365
    day_th_low = low_th_year * 365
    group_high, group_low = helper.get_risk_group(
        x, c, s, day_th_high, day_th_low)

    # Only a training set is drawn here; no validation split is used.
    train_x, train_y = helper.get_train(
        group_high, group_low, is_categori_y=False, seed=self.random_seed)

    if len(set_feature) > 0:
        train_x = train_x[:, set_feature]

    n_columns = train_x.shape[1]
    n_keep = (int(max(sel_feature_num, n_columns / div_ratio))
              if sel_feature_num == 0 else sel_feature_num)

    ranking = trace_ratio.trace_ratio(train_x, train_y, mode='index')
    return ranking[:n_keep]
def get_sel_idx(high_th_year, low_th_year, feature_list, sel_feature_num):
    """Return the first ``sel_feature_num`` indices ranked by ``RFS.rfs``.

    ``x``, ``c``, ``s`` and ``self`` are not parameters; they are resolved
    from the enclosing scope. Progress messages are printed before and after
    the selector runs.

    Args:
        high_th_year: High-risk threshold; multiplied by 365 before use
            (presumably a year-to-day conversion).
        low_th_year: Low-risk threshold; multiplied by 365 before use.
        feature_list: Not used by this function.
        sel_feature_num: Number of leading indices to return.

    Returns:
        A slice of the index array returned by ``RFS.rfs(..., mode='index')``.
    """
    day_th_high = high_th_year * 365
    day_th_low = low_th_year * 365
    group_high, group_low = helper.get_risk_group(
        x, c, s, day_th_high, day_th_low)

    # Only a training set is drawn here; no validation split is used.
    train_x, train_y = helper.get_train(
        group_high, group_low, is_categori_y=False, seed=self.random_seed)

    print('Into RFS fs...')
    ranking = RFS.rfs(train_x, train_y, mode='index', verbose=True)
    print('RFS fs done...')

    return ranking[:sel_feature_num]
def get_sel_idx(high_th_year, low_th_year, feature_list, sel_feature_num):
    """Return the ``sel_feature_num`` columns XGBoost scores as most important.

    ``x``, ``c``, ``s`` and ``self`` are not parameters; they are resolved
    from the enclosing scope. Unlike a train-only selector, this one draws
    a validation split and uses it as the evaluation set while fitting.

    Args:
        high_th_year: High-risk threshold; multiplied by 365 before use
            (presumably a year-to-day conversion).
        low_th_year: Low-risk threshold; multiplied by 365 before use.
        feature_list: Not used by this function.
        sel_feature_num: Number of leading indices to return.

    Returns:
        Column indices ordered by descending ``feature_importances_``,
        truncated to ``sel_feature_num`` entries.
    """
    day_th_high = high_th_year * 365
    day_th_low = low_th_year * 365
    group_high, group_low = helper.get_risk_group(
        x, c, s, day_th_high, day_th_low)

    # Training and validation sets are both drawn with the shared seed.
    train_x, train_y, valid_x, valid_y = helper.get_train_val(
        group_high, group_low, is_categori_y=False, seed=self.random_seed)

    # NOTE(review): the booster seed is fixed at 1 rather than taken from
    # self.random_seed, and eval_metric / early_stopping_rounds are passed to
    # fit(); newer xgboost releases may expect them on the constructor -- confirm
    # against the pinned xgboost version.
    booster = xgb.XGBClassifier(seed=1, objective='binary:logistic')
    booster.fit(
        train_x,
        train_y,
        eval_set=[(valid_x, valid_y)],
        verbose=False,
        eval_metric='auc',
        early_stopping_rounds=200,
    )

    # argsort is ascending, so reverse it to put the best-scoring column first.
    ranking = np.argsort(booster.feature_importances_)[::-1]
    return ranking[:sel_feature_num]
def get_sel_idx(high_th_year, low_th_year, feature_list, sel_feature_num):
    """Return the first ``sel_feature_num`` indices ranked by ReliefF.

    ``x``, ``c``, ``s`` and ``self`` are not parameters; they are resolved
    from the enclosing scope.

    Args:
        high_th_year: High-risk threshold; multiplied by 365 before use
            (presumably a year-to-day conversion).
        low_th_year: Low-risk threshold; multiplied by 365 before use.
        feature_list: Not used by this function.
        sel_feature_num: Number of leading indices to return.

    Returns:
        A slice of the index array returned by
        ``reliefF.reliefF(..., mode='index')``.
    """
    day_th_high = high_th_year * 365
    day_th_low = low_th_year * 365
    group_high, group_low = helper.get_risk_group(
        x, c, s, day_th_high, day_th_low)

    # Only a training set is drawn here; no validation split is used.
    train_x, train_y = helper.get_train(
        group_high, group_low, is_categori_y=False, seed=self.random_seed)

    ranking = reliefF.reliefF(train_x, train_y, mode='index')
    return ranking[:sel_feature_num]
def get_sel_idx(high_th_year, low_th_year, feature_list, sel_feature_num):
    """Return the columns with the largest linear-SVM weight magnitudes.

    ``x``, ``c``, ``s`` and ``self`` are not parameters; they are resolved
    from the enclosing scope.

    Args:
        high_th_year: High-risk threshold; multiplied by 365 before use
            (presumably a year-to-day conversion).
        low_th_year: Low-risk threshold; multiplied by 365 before use.
        feature_list: Not used by this function.
        sel_feature_num: Number of leading indices to return.

    Returns:
        Column indices ordered by descending absolute mean coefficient,
        truncated to ``sel_feature_num`` entries.
    """
    day_th_high = high_th_year * 365
    day_th_low = low_th_year * 365
    group_high, group_low = helper.get_risk_group(
        x, c, s, day_th_high, day_th_low)

    # Only a training set is drawn here; no validation split is used.
    train_x, train_y = helper.get_train(
        group_high, group_low, is_categori_y=False, seed=self.random_seed)

    linear_svm = svm.SVC(kernel='linear')
    linear_svm.fit(train_x, train_y)

    # Average the coefficient rows first, then take the magnitude; the order
    # matters because opposite-signed rows cancel in the mean.
    mean_weight = np.mean(linear_svm.coef_, axis=0)
    weight_magnitude = np.abs(mean_weight)

    # argsort is ascending, so reverse it to put the heaviest column first.
    ranking = np.argsort(weight_magnitude)[::-1]
    return ranking[:sel_feature_num]
def get_wx_sel_idx(high_th_year, low_th_year, feature_list, set_feature,
                   sel_feature_num, sel_op, div_ratio=4):
    """Run the Wx feature selector and return the indices it picks.

    ``x``, ``c``, ``s`` and ``self`` are not parameters; they are resolved
    from the enclosing scope.

    Args:
        high_th_year: High-risk threshold; multiplied by 365 before use
            (presumably a year-to-day conversion).
        low_th_year: Low-risk threshold; multiplied by 365 before use.
        feature_list: Forwarded unchanged to ``DoFeatureSelectionWX``.
        set_feature: Column selector applied to both the training and the
            validation matrix when it is non-empty.
        sel_feature_num: Number of features to select. When it equals 0 the
            count is derived from ``div_ratio`` and a learning ratio of 0.01
            is used; otherwise the learning ratio is 0.001.
        sel_op: Forwarded as ``sel_option`` to ``DoFeatureSelectionWX``.
        div_ratio: Divisor applied to the number of columns when
            ``sel_feature_num`` is 0.

    Returns:
        The first element of the 4-tuple returned by ``DoFeatureSelectionWX``.
    """
    day_th_high = high_th_year * 365
    day_th_low = low_th_year * 365
    group_high, group_low = helper.get_risk_group(
        x, c, s, day_th_high, day_th_low)

    train_x, train_y, valid_x, valid_y = helper.get_train_val(
        group_high, group_low, is_categori_y=True, seed=self.random_seed)

    if len(set_feature) > 0:
        train_x = train_x[:, set_feature]
        valid_x = valid_x[:, set_feature]

    n_columns = train_x.shape[1]

    # The two modes differ only in learning ratio and in how the number of
    # features to keep is obtained.
    if sel_feature_num == 0:
        learning_ratio = 0.01
        n_keep = int(max(sel_feature_num, n_columns / div_ratio))
    else:
        learning_ratio = 0.001
        n_keep = sel_feature_num

    hyper_params = WxHyperParameter(
        epochs=50,
        learning_ratio=learning_ratio,
        batch_size=int(len(train_x) / 4),
        verbose=True,
    )

    # The validation split is passed twice: once as the validation data and
    # once in the following pair of positional slots.
    sel_idx, _genes, _weights, _auc = DoFeatureSelectionWX(
        train_x, train_y, valid_x, valid_y, valid_x, valid_y,
        feature_list, hyper_params, n_sel=n_keep, sel_option=sel_op)
    return sel_idx
def get_sel_idx(high_th_year, low_th_year, feature_list, sel_feature_num):
    """Return the ``sel_feature_num`` columns a random forest ranks highest.

    ``x``, ``c``, ``s`` and ``self`` are not parameters; they are resolved
    from the enclosing scope.

    Args:
        high_th_year: High-risk threshold; multiplied by 365 before use
            (presumably a year-to-day conversion).
        low_th_year: Low-risk threshold; multiplied by 365 before use.
        feature_list: Not used by this function.
        sel_feature_num: Number of leading indices to return.

    Returns:
        Column indices ordered by descending ``feature_importances_``,
        truncated to ``sel_feature_num`` entries.
    """
    day_th_high = high_th_year * 365
    day_th_low = low_th_year * 365
    group_high, group_low = helper.get_risk_group(
        x, c, s, day_th_high, day_th_low)

    # Only a training set is drawn here; no validation split is used.
    train_x, train_y = helper.get_train(
        group_high, group_low, is_categori_y=False, seed=self.random_seed)

    # NOTE(review): the forest is created without a random_state, so the
    # ranking may differ between runs even though the sampling is seeded.
    forest = RandomForestClassifier()
    forest.fit(train_x, train_y)

    # argsort is ascending, so reverse it to put the best-scoring column first.
    ranking = np.argsort(forest.feature_importances_)[::-1]
    return ranking[:sel_feature_num]
def get_sel_idx(high_th_year, low_th_year, feature_list, sel_feature_num):
    """Return the first ``sel_feature_num`` indices from an l2,1 weight ranking.

    ``x``, ``c``, ``s`` and ``self`` are not parameters; they are resolved
    from the enclosing scope.

    Args:
        high_th_year: High-risk threshold; multiplied by 365 before use
            (presumably a year-to-day conversion).
        low_th_year: Low-risk threshold; multiplied by 365 before use.
        feature_list: Not used by this function.
        sel_feature_num: Number of leading indices to return.

    Returns:
        A slice of the ordering that ``feature_ranking`` derives from the
        weight matrix returned by ``ll_l21.proximal_gradient_descent``.
    """
    day_th_high = high_th_year * 365
    day_th_low = low_th_year * 365
    group_high, group_low = helper.get_risk_group(
        x, c, s, day_th_high, day_th_low)

    # Only a training set is drawn here; no validation split is used.
    train_x, train_y = helper.get_train(
        group_high, group_low, is_categori_y=False, seed=self.random_seed)

    # Only the first of the three returned values (the weights) is needed.
    weights, _, _ = ll_l21.proximal_gradient_descent(
        train_x, train_y, z=0.01, mode='raw')

    ranking = feature_ranking(weights)
    return ranking[:sel_feature_num]