def run(self, df, target_label):
    target = df[target_label]
    feature = df.drop(target_label, axis=1)
    clf = RandomizedLogisticRegression()
    # Dump any column that contains NaN or inf values before fitting.
    for col in feature.columns:
        values = feature[col].values
        if np.any(np.isnan(values)) or np.any(np.isinf(values)):
            print(list(values))
    try:
        clf.fit(feature.values, target.values)
    except Exception:
        # Dump every column to help diagnose the failure, then re-raise.
        for col in feature.columns:
            print(list(feature[col].values))
        raise
    scores = {col: abs(clf.scores_[i]) for i, col in enumerate(feature.columns)}
    scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    print(scores)
    position = {col: i for i, (col, _) in enumerate(scores)}
    print(position)
    return position
def randomized_Logistic_regression(self):
    X = self.data[:, 1:len(self.data[0])]
    y = self.data[:, 0]
    randomized_logistic = RandomizedLogisticRegression()
    randomized_logistic.fit(X, y)
    support_mask = randomized_logistic.get_support()
    selected = np.where(support_mask)
    return selected
def _get_clfs(self):
    clf_dict = {
        "rlrclf": RandomizedLogisticRegression(),
        "rfclf": RandomForestClassifier(criterion='entropy'),
        "dtrclf": DecisionTreeClassifier(criterion='entropy'),
        "lrclf": LogisticRegression(),
    }
    return clf_dict
def stability(X, Y):
    from sklearn.linear_model import RandomizedLogisticRegression
    clf = RandomizedLogisticRegression(random_state=1)
    clf.fit(X, Y)
    return clf.scores_
def Feature_sort(Feat_scale, Label, threads=4):
    # Rank the features with three different feature-selection methods.
    ranks = {}
    # Univariate feature selection
    Selector = SelectKBest(f_classif, k='all')
    Selector.fit_transform(Feat_scale, Label)
    ranks["Univariate_f"] = np.argsort(Selector.pvalues_)
    # Randomized logistic regression; a larger n_resampling gives a more robust result.
    # Rankings from roughly the 1900th feature onward look rather questionable.
    # Note: RandomizedLogisticRegression is deprecated in scikit-learn 0.19 and
    # removed in 0.21 (it emits a DeprecationWarning when fitted).
    rlogreg = RandomizedLogisticRegression(n_jobs=1, n_resampling=2000,
                                           selection_threshold=0,
                                           verbose=False, random_state=0)
    rlogreg.fit(Feat_scale, Label)
    ranks["Randomized_Logistic_f"] = np.argsort(-abs(rlogreg.scores_))
    # Boruta, based on a random forest
    rf = RandomForestClassifier(random_state=0, n_jobs=threads, max_features='auto')
    feat_selector = BorutaPy(rf, n_estimators='auto', perc=80, random_state=0)
    feat_selector.fit(Feat_scale, Label)
    ranks["Boruta_f"] = np.argsort(feat_selector.ranking_)
    return ranks
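The three entries in the ranks dict returned by Feature_sort are argsort-style orderings (best feature first), so they can be merged into a single consensus ranking. The following is a minimal sketch of one way to do that, by mean rank position; the aggregate_ranks helper and the toy rankings are illustrative assumptions, not part of the original code.

import numpy as np

def aggregate_ranks(ranks):
    """Combine several argsort-style rankings into one consensus order.

    ranks: dict mapping method name -> array of feature indices,
           best-ranked feature first (as returned by Feature_sort).
    Returns feature indices sorted by their mean rank position.
    """
    n = len(next(iter(ranks.values())))
    total_pos = np.zeros(n)
    for order in ranks.values():
        # order[k] is the feature ranked k-th; invert to per-feature position
        pos = np.empty(n)
        pos[order] = np.arange(n)
        total_pos += pos
    return np.argsort(total_pos / len(ranks))

# Toy example with three fake rankings over 4 features:
ranks = {
    "Univariate_f": np.array([2, 0, 1, 3]),
    "Randomized_Logistic_f": np.array([2, 1, 0, 3]),
    "Boruta_f": np.array([0, 2, 1, 3]),
}
print(aggregate_ranks(ranks))  # -> [2 0 1 3]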
def get_feature_selection_model_from_name(type_of_estimator, model_name):
    model_map = {
        'classifier': {
            'SelectFromModel': SelectFromModel(
                RandomForestClassifier(n_jobs=-1, max_depth=10, n_estimators=15),
                threshold='20*mean'),
            'RFECV': RFECV(estimator=RandomForestClassifier(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'RandomizedSparse': RandomizedLogisticRegression(),
            'KeepAll': 'KeepAll'
        },
        'regressor': {
            'SelectFromModel': SelectFromModel(
                RandomForestRegressor(n_jobs=-1, max_depth=10, n_estimators=15),
                threshold='0.7*mean'),
            'RFECV': RFECV(estimator=RandomForestRegressor(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'RandomizedSparse': RandomizedLasso(),
            'KeepAll': 'KeepAll'
        }
    }
    return model_map[type_of_estimator][model_name]
def get_feature_selection_model_from_name(type_of_estimator, model_name):
    # TODO(PRESTON): eventually let threshold be user-configurable (or grid_searchable)
    # TODO(PRESTON): optimize the params used here
    model_map = {
        'classifier': {
            'SelectFromModel': SelectFromModel(RandomForestClassifier(n_jobs=-1)),
            'RFECV': RFECV(estimator=RandomForestClassifier(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'RandomizedSparse': RandomizedLogisticRegression(),
            'KeepAll': 'KeepAll'
        },
        'regressor': {
            'SelectFromModel': SelectFromModel(RandomForestRegressor(n_jobs=-1)),
            'RFECV': RFECV(estimator=RandomForestRegressor(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'RandomizedSparse': RandomizedLasso(),
            'KeepAll': 'KeepAll'
        }
    }
    return model_map[type_of_estimator][model_name]
def feature_selection_class(predictors, responses, test_predictors, selectFeatTech):
    if selectFeatTech == 0:
        # t = int(predictors.shape[1] * 0.40)
        t = 500  # number of features to select
        model = SelectKBest(chi2, k=t).fit(predictors.replace(-1, 0), responses)
        # print(model.scores_)
        predictors_new = model.transform(predictors)
        predictors_test_new = model.transform(test_predictors)
        indices = model.get_support(indices=True)
    elif selectFeatTech == 1:
        randomized_logistic = RandomizedLogisticRegression()
        model = randomized_logistic.fit(predictors, responses)
        predictors_new = model.transform(predictors)
        predictors_test_new = model.transform(test_predictors)
        indices = model.get_support(indices=True)
    column_names = predictors.columns[indices]
    predictors_new = pd.DataFrame(predictors_new, index=predictors.index,
                                  columns=column_names)
    predictors_test_new = pd.DataFrame(predictors_test_new,
                                       index=test_predictors.index,
                                       columns=column_names)
    return predictors_new, predictors_test_new
def lasso_regression(X, y):
    """Use Randomized Logistic Regression to select the features based on
    the coefficient values."""
    clf = RandomizedLogisticRegression(C=1.0)
    clf.fit(X, y)
    print('Number of non zero valued coefficients: ', np.sum(clf.scores_ > 0))
    imp_feature_idx = clf.scores_.argsort()
    qualities = []
    X_train, X_test, y_train, y_test = split_examples(X, y)
    # Drop the lowest-scoring features a few at a time and track test AUC.
    for i in range(0, 100, 4):
        clf = LogisticRegression(C=0.1)
        clf.fit(X_train[:, imp_feature_idx[i:]], y_train)
        q = roc_auc_score(
            y_test, clf.predict_proba(X_test[:, imp_feature_idx[i:]])[:, 1])
        qualities.append(q)
    plt.plot(range(0, 100, 4), qualities)
    plt.show()
    return qualities
def build_classifier(definition, datas):
    if definition['classification'] == 'lr':
        classifier = LogisticRegression(C=1.5)
    elif definition['classification'] == 'sgd':
        classifier = SGDClassifier(alpha=0.0001, n_iter=10**2)
    elif definition['classification'] == 'sgd_grid':
        best_params = grid_search_params(datas)
        classifier = SGDClassifier(n_iter=10**2, **best_params)

    rlr_feature_selection = RandomizedLogisticRegression(C=1.5, n_jobs=-1, verbose=0)

    # Standard sklearn classifier
    clf = Pipeline([
        # ('string_encoder', pp_encode_strings),
        # ('drop_nan_cols', pp_drop_nan_cols),
        # ('fix_collinear', pp_fix_collinear),
        # ('float_imputer', pp_imputer),
        # ('scaler', pp_scaler),
        # ('feature_selection', rlr_feature_selection),
        ('classification', classifier)
    ])
    return clf
def randlogistic(self, selection_threshold=0.25, sample_fraction=0.75):
    rlr_model = RandomizedLogisticRegression(
        C=self.C,
        selection_threshold=selection_threshold,
        normalize=False,
        sample_fraction=sample_fraction)
    rlr_model.fit(self.data.values, self.target.values)
    return rlr_model
def evaluate_stability(vocab, id_to_vec, mesh_to_id):
    labels = ('Male', 'Female', 'Both')
    Xs, ids = get_basic_Xs(id_to_vec, mesh_to_id, shuffle=True)
    Xtr, Ytr, Itr, Xte, Yte, Ite = get_test_train(labels, ids, Xs, 5)
    print('Fitting RandomizedLR...')
    logreg = RandomizedLogisticRegression(verbose=True, n_resampling=1000, n_jobs=16)
    logreg.fit(Xtr, Ytr)
    scores = logreg.scores_
    return {vocab[i]: score for i, score in enumerate(scores)}
def get_features(X_train, y_train, names, selection_threshold=0.2):
    print('\ngetting features with randomized logistic regression...')
    print('using a selection threshold of {}'.format(selection_threshold))
    randomized_logistic = RandomizedLogisticRegression(
        selection_threshold=selection_threshold)
    randomized_logistic.fit(X_train, y_train)
    mask = randomized_logistic.get_support()
    features = np.array(names)[mask]
    print('found {} ngrams:'.format(len(features)))
    print(list(features))
    return features
def rank_features(algorithm, X, y):
    # The RFE approach can be used with various different classifiers
    if algorithm == 'random_forest_rfe':
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.feature_selection import RFE
        estimator = RandomForestClassifier(n_estimators=50, random_state=R_SEED,
                                           n_jobs=1)
        selector = RFE(estimator, 5, step=0.1)
        selector.fit(X, y)
        for x in sorted(zip(map(lambda x: round(x, 4), selector.ranking_), features)):
            print(x[1])
    elif algorithm == 'svm_rfe':
        from sklearn.svm import SVC
        from sklearn.feature_selection import RFE
        estimator = SVC(random_state=R_SEED, kernel='linear')
        selector = RFE(estimator, 5, step=0.1)
        selector.fit(X, y)
        for x in sorted(zip(map(lambda x: round(x, 4), selector.ranking_), features)):
            print(x[1])
    elif algorithm == 'random_logistic_regression':
        # See http://blog.datadive.net/selecting-good-features-part-iv-stability-selection-rfe-and-everything-side-by-side/
        from sklearn.linear_model import RandomizedLogisticRegression
        rlasso = RandomizedLogisticRegression(random_state=R_SEED)
        rlasso.fit(X, y)
        for x in sorted(zip(map(lambda x: round(x, 4), rlasso.scores_), features),
                        reverse=True):
            print(x[1])
    elif algorithm == 'random_lasso':
        from sklearn.linear_model import RandomizedLasso
        rlasso = RandomizedLasso(random_state=R_SEED)
        # rlasso = RandomizedLasso(alpha=0.025, random_state=R_SEED)
        rlasso.fit(X, y)
        for x in sorted(zip(map(lambda x: round(x, 4), rlasso.scores_), features),
                        reverse=True):
            print(x[1])
    elif algorithm == 'anova':
        from sklearn.feature_selection import f_classif
        F, pval = f_classif(X, y)
        random_array = random.random(len(pval))
        order = lexsort((random_array, pval))  # will break ties by random
        for i in order:
            print(features[i])
    else:
        print("Invalid algorithm: %s" % algorithm)
        exit(1)
def run_logreg(X_train, y_train, X_test, y_test, selection_threshold=0.2):
    print('\nrunning logistic regression...')
    print('using a selection threshold of {}'.format(selection_threshold))
    pipe = Pipeline([
        ('feature_selection', RandomizedLogisticRegression(
            selection_threshold=selection_threshold)),
        ('classification', LogisticRegression())
    ])
    pipe.fit(X_train, y_train)
    print('training accuracy : {}'.format(pipe.score(X_train, y_train)))
    print('testing accuracy : {}'.format(pipe.score(X_test, y_test)))
    return pipe
def feature_method_selection(data, label, fsname):
    """
    select features by option 'fsname'
    :param data:
    :param label:
    :param fsname:
    :return: new_data, selected data
    :return: selected_features_inx, the index of selected feature, starts with 0
    """
    if fsname == 'variance_threshold':
        # Discard near-constant features (for discrete values).
        model = VarianceThreshold()  # th=1
        return model.fit_transform(data)
    elif fsname == 'select_kbest':
        # Feature values must be non-negative; chi2 is for classification.
        model = SelectKBest(chi2, k=10)
    elif fsname == 'rfe':
        # Recursive feature elimination; very time-consuming.
        svc = SVC(kernel='linear', C=1)
        model = RFE(estimator=svc, n_features_to_select=10, step=1)
    elif fsname == 'rfecv':
        # RFE with cross-validation; labels must be numeric.
        svc = SVC(kernel="linear")
        model = RFECV(estimator=svc, step=1, cv=StratifiedKFold(label, 1),
                      scoring='accuracy')
    elif fsname == 'RandLasso':
        # Stability selection over reshuffled subsets; may raise
        # "cannot perform reduce with flexible type" on non-numeric input.
        model = RandomizedLogisticRegression()
    elif fsname == 'linear_svc':
        model = LinearSVC()  # has no feature importances
    elif fsname == 'tree':
        model = ExtraTreesClassifier()
    elif fsname == 'fclassif':
        # Defaults to f_classif; a larger score means a more useful feature.
        model = SelectFpr()
    elif fsname == 'pearsonr':
        # Labels must be numeric; the result is the correlation of the two inputs.
        label = turn_label_2num(label)
        return pearsonr(data, label)
    elif fsname == 'RandForReg':
        # Labels must be numeric.
        label = turn_label_2num(label)
        model = RandomForestRegressor()
    else:
        logging.error('ERROR: feature selection option is wrong')
        return None
    model.fit(data, label)
    new_data = model.transform(data)  # keep only the selected features
    return new_data
def getElgiibleFeatures(allFeatureParam, allLabelParam):
    '''
    References:
    http://scikit-learn.org/stable/modules/feature_selection.html#randomized-l1
    http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RandomizedLogisticRegression.html
    '''
    logiRegObj = RandomizedLogisticRegression()
    logiRegObj.fit(allFeatureParam, allLabelParam)
    # Output
    # print("Model score: ", logiRegObj.scores_)
    eligible_indices = logiRegObj.get_support(indices=True)
    return eligible_indices
def test_rflasso():
    train_X, test_X, train_Y, test_Y = train_test_split(
        index_data, index_lable, test_size=0.25, random_state=1)
    from sklearn.linear_model import LogisticRegression
    from sklearn.feature_selection import SelectFromModel
    from sklearn.svm import SVC
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.linear_model import RandomizedLogisticRegression
    randomized_logistic = RandomizedLogisticRegression(C=0.1, n_jobs=2)
    randomized_logistic.fit(train_X, train_Y)
    XX = randomized_logistic.transform(train_X)
    print(XX.shape)
def set_data(self, user_atts, inter_atts, responses):
    self.build_data_representations(user_atts, inter_atts)
    # Convert from dict representation into matrix:
    predictor_rows = self.dict_vectorizer.fit_transform(self.dicts_rep).toarray()
    print(predictor_rows)
    print('Finding optimal feature set...')
    self.ff_model = RandomizedLogisticRegression()  # Finds best set of features
    # Fit data and get transformed input rows:
    X_new = self.ff_model.fit_transform(predictor_rows, responses)
    print(X_new)
    print('Done! Final Shape: ' + str(X_new.shape))
    print('Building Final model...')
    self.model = LogisticRegression().fit(X_new, responses)
    print('Done!')
def select_features(X, y):
    '''
    Select the relevant features from X that are useful for predicting
    the labels in y.

    Args:
        X: numpy 2D array containing input features
        y: numpy 1D array containing labels

    Returns:
        feature_list: List of indices of the selected important features
    '''
    # Get the selection model (stability selection)
    selection_model = RandomizedLogisticRegression(random_state=0)
    selection_model.fit(X, y)

    # Use a cross-validated logistic regression to choose the importance
    # threshold at which a feature is included
    step_size = 50
    max_weight = int(max(selection_model.scores_)) + 1
    trial_thresholds = [i / step_size for i in range(1, max_weight * step_size + 1)]
    threshold = 0
    max_score = 0
    for trial in trial_thresholds:
        selected_features = [
            i for i, score in enumerate(selection_model.scores_) if score > trial
        ]
        if len(selected_features) > 0:
            X_reduced = X[:, selected_features]
            model = LogisticRegression(multi_class='multinomial',
                                       class_weight='balanced',
                                       solver='newton-cg',
                                       random_state=0,
                                       max_iter=1000)
            scores = cross_val_score(model, X_reduced, y, cv=5)
            score = scores.mean()
            if score >= max_score:
                max_score = score
                threshold = trial  # trial is already a threshold; do not rescale it
    return [
        i for i, score in enumerate(selection_model.scores_) if score > threshold
    ]
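A hedged usage sketch for select_features, assuming a scikit-learn version older than 0.21 (the last releases to ship RandomizedLogisticRegression); the iris dataset is purely stand-in data.

from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
kept = select_features(X, y)  # indices whose stability score beats the CV-chosen threshold
print("selected feature indices:", kept)
X_reduced = X[:, kept]        # reduced design matrix for downstream models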
def stability_test(x, y, model, names, score_type):
    if score_type != "r2":
        rlasso = RandomizedLogisticRegression()
    else:
        rlasso = RandomizedLasso(alpha=0.025)
    rlasso.fit(x, y)
    if sum(rlasso.scores_) == 0:
        return [[0, el] for el in names]
    # Rescale the scores to [0, 1] before pairing them with feature names.
    maxval = max(rlasso.scores_)
    minval = min(rlasso.scores_)
    dist = maxval - minval
    return list(zip(map(lambda x: round(x, 4), (rlasso.scores_ - minval) / dist),
                    names))
def feature_selection_tech(predictors, responses, test_predictors, selectFeatTech):
    if selectFeatTech == 0:
        # t = int(predictors.shape[1] * 0.40)
        t = 40
        model = SelectKBest(chi2, k=t).fit(predictors, responses)
        predictors_new = model.transform(predictors)
        predictors_test_new = model.transform(test_predictors)
        indices = model.get_support(indices=True)
    elif selectFeatTech == 1:
        randomized_logistic = RandomizedLogisticRegression()
        model = randomized_logistic.fit(predictors, responses)
        predictors_new = model.transform(predictors)
        predictors_test_new = model.transform(test_predictors)
        indices = model.get_support(indices=True)
    return predictors_new, predictors_test_new, indices
def run_logreg(X_train, y_train, X_test, y_test, selection_threshold=0.2):
    print("\nrunning logistic regression...")
    print("using a selection threshold of {}".format(selection_threshold))
    pipe = Pipeline(
        [
            (
                "feature_selection",
                RandomizedLogisticRegression(selection_threshold=selection_threshold),
            ),
            ("classification", LogisticRegression()),
        ]
    )
    pipe.fit(X_train, y_train)
    print("training accuracy : {}".format(pipe.score(X_train, y_train)))
    print("testing accuracy : {}".format(pipe.score(X_test, y_test)))
    return pipe
def log_reg_feat_selection(X_train, y_train, X_valid, y_valid, random_state):
    """
    Feature selection based on the scores given to the features by the
    RandomizedLogisticRegression algorithm.
    """
    rlr = RandomizedLogisticRegression(C=[0.001, 0.01, 0.1, 1.],
                                       sample_fraction=0.7,
                                       n_resampling=200,
                                       selection_threshold=0.25,
                                       verbose=5,
                                       n_jobs=-1,
                                       random_state=random_state)
    rlr.fit(X_train, y_train)
    np.save('save/feat_sel_log_reg.npy', rlr.scores_)
    return rlr.scores_
def rank_random_logistic_regression(self, features_indep_df: PandasDataFrame,
                                    feature_target: List,
                                    n_jobs: int = -1, **kwargs: Any) -> object:
    """Use Randomized Logistic Regression to rank features.
    Attributes:
        model.scores_
        model.all_scores_
    :param features_indep_df: the independent features, which are inputted into the model.
    :param feature_target: the target feature, which is being estimated.
    :param n_jobs: number of CPUs to use during the resampling. If '-1', use all the CPUs.
    :param kwargs: C=1, scaling=0.5, sample_fraction=0.75, n_resampling=200,
        selection_threshold=0.25, tol=0.001, fit_intercept=True, verbose=False,
        normalize=True, random_state=None, pre_dispatch='3*n_jobs'
    :return: the importance ranking model.
    """
    self.__logger.debug("Run Random Logistic Regression.")
    classifier = RandomizedLogisticRegression(n_jobs=n_jobs, **kwargs)
    return classifier.fit(features_indep_df, feature_target)
def _run_randomized_regression(self, feature_df, annotation, clinical_column,
                               sample_fraction=0.7):
    annotation = copy.deepcopy(annotation)
    # Encode labels of the classes
    le = preprocessing.LabelEncoder()
    annotation[clinical_column] = le.fit_transform(annotation[clinical_column])
    clf = RandomizedLogisticRegression(
        n_resampling=self.rr_iterations,
        sample_fraction=sample_fraction,
        n_jobs=1,
        verbose=1,
    ).fit(feature_df, annotation[clinical_column])
    selected_features = feature_df.T[clf.scores_ != 0].index
    logger.info("Number of selected features: %d", len(selected_features))
    return selected_features, clf
def rdlg_variables(X, y, threshold=0.25):  # default threshold 0.25
    """
    Use randomized logistic regression to select the variables that are
    linearly related to y (stability selection).

    The selection algorithm (RLR) is run on many different subsets of the
    data and of the features, and the per-run results are aggregated into
    a final score for each feature.

    Stability selection is a relatively recent method that combines
    subsampling with a selection algorithm (regression, SVM, or similar).
    The key idea is to run the selection algorithm repeatedly on different
    data and feature subsets and aggregate the results, for example by
    counting how often a feature is judged important (times selected
    divided by the number of subsets it appeared in). Ideally, important
    features score close to 100%, weaker features get small non-zero
    scores, and useless features score near 0. Crucially, and unlike with
    the Lasso, good features are not zeroed out merely because similar,
    correlated features exist. For feature selection, stability selection
    is often among the best-performing methods across many datasets and
    settings.
    """
    rlr = RandomizedLogisticRegression(selection_threshold=threshold)
    rlr.fit(X, y)
    # Aggregate the final per-feature scores.
    scoretable = pd.DataFrame(rlr.all_scores_, index=X.columns)
    scoretable = scoretable.reset_index()
    scoretable = scoretable.rename(columns={'index': 'Col', 0: 'value_ratio'},
                                   copy=False)
    # Drop features whose score falls below the threshold (default 0.25).
    df_score = scoretable[scoretable.value_ratio > threshold]
    refreshed_data = X[list(df_score['Col'])]
    return scoretable, refreshed_data
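A minimal usage sketch for rdlg_variables, assuming X is a pandas DataFrame (the function indexes X.columns), y is a binary target, and an sklearn version that still provides RandomizedLogisticRegression; the synthetic data below is illustrative only.

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
X = pd.DataFrame(rng.randn(200, 5), columns=list("abcde"))
# The target depends only on columns 'a' and 'b', so they should score highest.
y = (X["a"] + 0.5 * X["b"] + 0.1 * rng.randn(200) > 0).astype(int)

scoretable, X_kept = rdlg_variables(X, y, threshold=0.25)
print(scoretable)             # per-feature stability scores
print(list(X_kept.columns))   # features whose score exceeds 0.25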
def pick_variables(self, descover=True, method="rlr", threshold=0.25,
                   auto_pick=True):  # default threshold 0.25
    # Variable-picking helper (feature selection).
    if method == "rlr":
        """
        Top-level feature-selection algorithm: use randomized logistic
        regression to select the variables linearly related to y
        (stability selection).

        The selection algorithm (rlr) is run on different subsets of the
        data and of the features, and the results are aggregated into a
        final score per feature.

        Stability selection is a relatively recent method that combines
        subsampling with a selection algorithm (regression, SVM, or
        similar). Its key idea is to run the selection algorithm
        repeatedly on different data and feature subsets and aggregate
        the results, for example the frequency with which a feature is
        judged important (times selected divided by the number of subsets
        it appeared in). Ideally, important features score close to 100%,
        weaker features get non-zero scores, and useless features score
        near 0.

        RandomizedLogisticRegression()
            fit(X, y)               Fit the model using X, y as training data.
            fit_transform(X[, y])   Fit to data, then transform it.
            get_params([deep])      Get parameters for this estimator.
            get_support([indices])  Get a mask, or integer index, of the features selected.
            inverse_transform(X)    Reverse the transformation operation.
            set_params(**params)    Set the parameters of this estimator.
            transform(X)            Reduce X to the selected features.
        """
        rlr = RandomizedLogisticRegression(selection_threshold=threshold)
        rlr.fit(self.X_train, self.y_train)
        # Aggregate the final per-feature scores.
        scoretable = pd.DataFrame(rlr.all_scores_,
                                  index=self.X_train.columns,
                                  columns=['var_score'])
        # get_support() returns a mask (or integer index) of the selected features.
        columns_need = list(self.X_train.columns[rlr.get_support()])
        self.X_train = self.X_train[columns_need]
        self.X_test = self.X_test[columns_need]
        columns_need.append("y")
        if auto_pick:
            self.picked_data = self.data[columns_need]
        return scoretable
sys.stderr.write("# of initial features: %d\n" % (len(registered_feat_names)))
sys.stderr.write("# of transformed features: %d\n" % (len(Xtr.toarray()[0])))
sel = None
n = len(feature_names)
if selector == "kbest":
    sel = SelectKBest(chi2, k=n)
elif selector == "kbest_anova":
    sel = SelectKBest(f_classif, k=n)
elif selector == "rfecv":
    sel = RFECV()
elif selector == "lasso":
    sel = SelectFromModel(LassoCV(), threshold=0.005)
elif selector == "rlregr":
    sel = RandomizedLogisticRegression()
elif selector == "svm":
    sel = eval("SelectFromModel(LinearSVC(%s))" % (args.selector_params))
elif selector == "extra_trees":
    sel = SelectFromModel(ExtraTreesClassifier())
elif selector == "random_forest":
    sel = SelectFromModel(RandomForestClassifier())
print(sel.estimator)
if type(sel) == SelectFromModel and type(sel.estimator) == LassoCV:
    sel.fit(Xtr, Ytr)
    # Rank features by coefficient, largest first.
    top_ranked = sorted(enumerate(sel.estimator_.coef_), key=lambda x: x[1],
                        reverse=True)
    top_indices = [idx for idx, _ in top_ranked]
    for feat, pval in zip(np.asarray(vectorizer.get_feature_names())[top_indices],
                          sel.estimator_.coef_[top_indices]):
        print("%s\t%s" % (feat, pval))
elif (type(sel) == SelectFromModel and
      (type(sel.estimator) == ExtraTreesClassifier or
       type(sel.estimator) == RandomForestClassifier)):
    SelectFwe,  # TODO: add tests and document
    GenericUnivariateSelect,
    VarianceThreshold,
    RFE,
    RFECV,
    SelectFromModel,
)
from sklearn.linear_model import LogisticRegression

_additional_test_cases = []
try:
    from sklearn.linear_model import (  # type: ignore
        RandomizedLogisticRegression,
        RandomizedLasso,  # TODO: add tests and document
    )
    _additional_test_cases.append(
        (RandomizedLogisticRegression(random_state=42),
         ['<NAME1>', '<NAME2>', '<NAME3>']))
except ImportError:  # Removed in scikit-learn 0.21
    pass

from sklearn.preprocessing import (
    MinMaxScaler,
    StandardScaler,
    MaxAbsScaler,
    RobustScaler,
)
from sklearn.pipeline import FeatureUnion, make_pipeline

from eli5 import transform_feature_names
from eli5.sklearn import PermutationImportance