def feature_selection_class(predictors, responses, test_predictors, selectFeatTech):
    """Select features on the training frame and apply the same selection to the test frame.

    Args:
        predictors: training feature DataFrame; its index and column names
            are reused for the returned training frame.
        responses: training labels.
        test_predictors: test feature DataFrame with the same columns as
            ``predictors``.
        selectFeatTech: 0 for chi-squared ``SelectKBest``, 1 for
            ``RandomizedLogisticRegression``.

    Returns:
        ``(predictors_new, predictors_test_new)``: DataFrames restricted to
        the selected columns.

    Raises:
        ValueError: if ``selectFeatTech`` is neither 0 nor 1 (previously this
            surfaced as an unbound-variable error further down).
    """
    if selectFeatTech == 0:
        # Number of features to keep, capped so frames with fewer than 500
        # columns do not make SelectKBest reject ``k``.
        t = min(500, predictors.shape[1])
        # Scores are computed with -1 replaced by 0; the transform below is
        # applied to the untouched frames.
        model = SelectKBest(chi2, k=t).fit(predictors.replace(-1, 0), responses)
    elif selectFeatTech == 1:
        model = RandomizedLogisticRegression().fit(predictors, responses)
    else:
        raise ValueError(
            "selectFeatTech must be 0 (chi2) or 1 (randomized logistic "
            "regression), got %r" % (selectFeatTech,))

    predictors_new = model.transform(predictors)
    predictors_test_new = model.transform(test_predictors)
    indices = model.get_support(indices=True)

    # Re-wrap the raw arrays so callers keep index and column labels.
    column_names = predictors.columns[indices]
    predictors_new = pd.DataFrame(predictors_new, index=predictors.index,
                                  columns=column_names)
    predictors_test_new = pd.DataFrame(predictors_test_new,
                                       index=test_predictors.index,
                                       columns=column_names)
    return predictors_new, predictors_test_new
def statiblity(X, Y):
    """Fit a seeded randomized logistic regression on (X, Y) and return its ``scores_``."""
    # Imported at call time, as in the original.
    from sklearn.linear_model import RandomizedLogisticRegression

    selector = RandomizedLogisticRegression(random_state=1)
    selector.fit(X, Y)
    feature_scores = selector.scores_
    return feature_scores
def randomized_Logistic_regression(self):
    """Run randomized logistic regression on ``self.data`` and return the selected features.

    Column 0 of ``self.data`` is used as the label and every remaining column
    as a feature.

    Returns:
        The result of ``np.where`` on the selector's support mask. Positions
        are relative to the feature columns, so position 0 corresponds to
        data column 1.
    """
    X = self.data[:, 1:]
    y = self.data[:, 0]
    randomized_logistic = RandomizedLogisticRegression()
    randomized_logistic.fit(X, y)
    support_mask = randomized_logistic.get_support()
    # The selection used to be computed and then dropped; hand it to the caller.
    return np.where(support_mask)
def run(self, df, target_label):
    """Rank the feature columns of ``df`` by randomized-logistic-regression score.

    Args:
        df: DataFrame holding the target column and the feature columns.
        target_label: name of the target column in ``df``.

    Returns:
        dict mapping each feature column name to its rank, where rank 0 is
        the column with the largest absolute score.

    Raises:
        Exception: whatever ``fit`` raises is re-raised after the feature
            values have been printed for diagnosis.
    """
    target = df[target_label]
    feature = df.drop(target_label, axis=1)
    clf = RandomizedLogisticRegression()

    # Print any column that contains NaN or infinity before fitting.
    for col in feature.columns:
        values = feature[col].values
        if np.any(np.isnan(values)) or np.any(np.isinf(values)):
            print(list(values))

    try:
        clf.fit(feature.values, target.values)
    except Exception:
        # Dump every column for debugging, then propagate the real error
        # instead of failing later on the missing ``scores_`` attribute.
        for col in feature.columns:
            print(list(feature[col].values))
        raise

    scores = {col: abs(clf.scores_[i]) for i, col in enumerate(feature.columns)}
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    print(ranked)

    position = {col: rank for rank, (col, _) in enumerate(ranked)}
    print(position)
    return position
def feture_select_RLR():
    """Return ``(name, score)`` pairs sorted by descending score.

    Data comes from ``get_data()``; scores are the fitted model's ``scores_``
    rounded to four decimals.
    """
    data_x, data_y, names = get_data()
    selector = RLR()
    selector.fit(data_x, data_y)
    rounded_scores = [round(score, 4) for score in selector.scores_]
    named_scores = zip(names, rounded_scores)
    return sorted(named_scores, key=lambda pair: pair[1], reverse=True)
def Feature_sort(Feat_scale, Label, threads=4):
    """Rank the features with three different feature-selection methods.

    Args:
        Feat_scale: feature matrix.
        Label: class labels.
        threads: ``n_jobs`` passed to the random forest used by Boruta.

    Returns:
        dict with keys ``"Univariate_f"``, ``"Randomized_Logistic_f"`` and
        ``"Boruta_f"``, each holding the feature indices produced by
        ``np.argsort`` for that method.
    """
    ranks = {}

    # Univariate F-test: indices ordered by ascending p-value.
    univariate = SelectKBest(f_classif, k='all')
    univariate.fit_transform(Feat_scale, Label)
    ranks["Univariate_f"] = np.argsort(univariate.pvalues_)

    # Randomized logistic regression: indices ordered by descending |score|.
    # Original author's notes: a larger n_resampling gives a more robust
    # result; from roughly position 1900 onwards the ordering looks
    # questionable. The class is deprecated in scikit-learn 0.19 and removed
    # in 0.21 (per the DeprecationWarning quoted in the original comment).
    stability = RandomizedLogisticRegression(n_jobs=1,
                                             n_resampling=2000,
                                             selection_threshold=0,
                                             verbose=False,
                                             random_state=0)
    stability.fit(Feat_scale, Label)
    ranks["Randomized_Logistic_f"] = np.argsort(-abs(stability.scores_))

    # Boruta on top of a random forest: indices ordered by ascending ranking_.
    forest = RandomForestClassifier(random_state=0, n_jobs=threads,
                                    max_features='auto')
    boruta = BorutaPy(forest, n_estimators='auto', perc=80, random_state=0)
    boruta.fit(Feat_scale, Label)
    ranks["Boruta_f"] = np.argsort(boruta.ranking_)

    return ranks
def randomized_Logistic_regression(self):
    """Run randomized logistic regression on ``self.data`` and return the selected features.

    Column 0 of ``self.data`` is used as the label and every remaining column
    as a feature.

    Returns:
        The result of ``np.where`` on the selector's support mask. Positions
        are relative to the feature columns, so position 0 corresponds to
        data column 1.
    """
    X = self.data[:, 1:]
    y = self.data[:, 0]
    randomized_logistic = RandomizedLogisticRegression()
    randomized_logistic.fit(X, y)
    support_mask = randomized_logistic.get_support()
    # The selection used to be computed and then dropped; hand it to the caller.
    return np.where(support_mask)
def feature_selection(train, test, y):
    """Fit a randomized logistic regression on (train, y) and reduce both matrices.

    Returns:
        ``(train, test)`` transformed by the fitted selector.
    """
    # The message is Chinese for "feature selection"; it is runtime output
    # and is kept verbatim.
    print("特征选择")
    selector = RLR(C=10,
                   scaling=0.5,
                   sample_fraction=0.6,
                   n_resampling=200,
                   selection_threshold=0.4,
                   n_jobs=3)
    selector.fit(train, y)
    reduced_train = selector.transform(train)
    reduced_test = selector.transform(test)
    return reduced_train, reduced_test
def randlogistic(self, selection_threshold=0.25, sample_fraction=0.75):
    """Fit a randomized logistic regression on ``self.data`` / ``self.target``.

    Args:
        selection_threshold: forwarded to ``RandomizedLogisticRegression``.
        sample_fraction: forwarded to ``RandomizedLogisticRegression``.

    Returns:
        The fitted ``RandomizedLogisticRegression`` instance, built with
        ``C=self.C`` and ``normalize=False``.
    """
    options = dict(C=self.C,
                   selection_threshold=selection_threshold,
                   normalize=False,
                   sample_fraction=sample_fraction)
    rlr_model = RandomizedLogisticRegression(**options)
    features = self.data.values
    labels = self.target.values
    rlr_model.fit(features, labels)
    return rlr_model
def get_features(X_train, y_train, names, selection_threshold=0.2):
    """Return the entries of ``names`` kept by randomized logistic regression.

    Args:
        X_train: training feature matrix.
        y_train: training labels.
        names: feature names, aligned with the columns of ``X_train``.
        selection_threshold: forwarded to ``RandomizedLogisticRegression``.

    Returns:
        numpy array holding the selected names; they are also printed.
    """
    print('\ngetting features with randomized logistic regression...')
    print('using a selection threshold of {}'.format(selection_threshold))

    selector = RandomizedLogisticRegression(
        selection_threshold=selection_threshold)
    selector.fit(X_train, y_train)

    features = np.array(names)[selector.get_support()]
    selected = list(features)
    print('found {} ngrams:'.format(len(selected)))
    print(selected)
    return features
def evaluate_stability(vocab, id_to_vec, mesh_to_id):
    """Map each vocabulary entry to its randomized-logistic-regression score.

    Builds the matrices through ``get_basic_Xs`` / ``get_test_train``, fits
    the selector on the training split and returns ``{vocab[i]: scores_[i]}``.
    """
    labels = ('Male', 'Female', 'Both')
    Xs, ids = get_basic_Xs(id_to_vec, mesh_to_id, shuffle=True)
    # Only the training matrix and labels are used below.
    Xtr, Ytr, _Itr, _Xte, _Yte, _Ite = get_test_train(labels, ids, Xs, 5)

    print('Fitting RandomizedLR...')
    logreg = RandomizedLogisticRegression(verbose=True, n_resampling=1000,
                                          n_jobs=16)
    logreg.fit(Xtr, Ytr)

    stability = {}
    for position, score in enumerate(logreg.scores_):
        stability[vocab[position]] = score
    return stability
def compute_randomized_lr_score(data_set_df, user_info_df, label='gender'):
    """Score each row of the filtered data set as a feature for predicting ``label``.

    Rows of the filtered frame are treated as features (the matrix is
    transposed before fitting). Rows that are entirely NaN are dropped, and
    remaining NaNs are filled by ``pc.fill_nan_features``.

    Args:
        data_set_df: raw data set, passed to ``pc.get_filtered_x_y``.
        user_info_df: user information, passed to ``pc.get_filtered_x_y``.
        label: target name, passed to ``pc.get_filtered_x_y``.

    Returns:
        DataFrame with a single ``importance`` column, sorted in descending
        order and indexed by the rows that took part in the fit.
    """
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    x = df_filtered.dropna(how='all')
    x_imp = pc.fill_nan_features(x) if x.isnull().any().any() else x.values

    clf = RandomizedLogisticRegression()
    clf.fit(x_imp.T, y_v)

    # Index with the rows that survived ``dropna``: using ``df_filtered.index``
    # gave a length mismatch whenever an all-NaN row had been dropped.
    # NOTE(review): assumes pc.fill_nan_features keeps the row count and order
    # of ``x`` -- confirm.
    feature_importances = DataFrame(clf.scores_, index=x.index,
                                    columns=['importance'])
    feature_importances.sort_values('importance', ascending=False,
                                    inplace=True, na_position='last')
    return feature_importances
def classify_logistic():
    """Select features with randomized logistic regression, classify, and print test accuracy.

    The data comes from ``util.load_all_feat()``. The randomized model is a
    feature selector and exposes no ``predict``, so it is used only to reduce
    the feature matrices; a plain ``LogisticRegression`` is fitted on the
    reduced training data to produce the predictions.
    """
    # Local import keeps the block self-contained.
    from sklearn.linear_model import LogisticRegression

    print("logistic regression")
    (X_train, y_train), (X_test, y_test) = util.load_all_feat()
    print("original X_train shape %s" % (X_train.shape,))

    selector = RandomizedLogisticRegression(n_jobs=2)
    selector.fit(X_train, y_train)

    clf = LogisticRegression()
    clf.fit(selector.transform(X_train), y_train)
    pred = clf.predict(selector.transform(X_test))
    print("accuracy score: %s" % accuracy_score(y_test, pred))
    # The leftover ``ipdb.set_trace()`` breakpoint was removed so the function
    # can run unattended.
def getElgiibleFeatures(allFeatureParam, allLabelParam):
    '''
    Return the integer indices of the features kept by a default
    RandomizedLogisticRegression fitted on the given features and labels.

    References:
    http://scikit-learn.org/stable/modules/feature_selection.html#randomized-l1
    http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RandomizedLogisticRegression.html
    '''
    selector = RandomizedLogisticRegression()
    selector.fit(allFeatureParam, allLabelParam)
    # Per-feature scores are available as selector.scores_ if needed.
    return selector.get_support(indices=True)
def randomlr(train_x, train_y, cv_x, test_x, regp, alpha=0.5):
    """Reduce the train, cv and test matrices to the features kept by randomized logistic regression.

    Args:
        train_x: training feature matrix, used to fit the selector.
        train_y: training labels.
        cv_x: validation feature matrix.
        test_x: test feature matrix.
        regp: value passed as ``C``.
        alpha: value passed as ``scaling``.

    Returns:
        ``(train_x, cv_x, test_x)`` transformed by the fitted selector.
    """
    selector = RandomizedLogisticRegression(C=regp, scaling=alpha,
                                            fit_intercept=True,
                                            sample_fraction=0.75,
                                            n_resampling=200)
    # Fit once: the earlier separate ``fit`` call was immediately overwritten
    # by ``fit_transform`` and only doubled the resampling cost.
    train_x = selector.fit_transform(train_x, train_y)
    cv_x = selector.transform(cv_x)
    test_x = selector.transform(test_x)
    return train_x, cv_x, test_x
def set_data(self, user_atts, inter_atts, responses):
    """Vectorize the attributes, select features, and fit the final model.

    Stores the fitted feature selector on ``self.ff_model`` and the fitted
    ``LogisticRegression`` on ``self.model``; progress is printed along the way.
    """
    self.build_data_representations(user_atts, inter_atts)

    # Turn the list-of-dicts representation into a dense matrix.
    vectorized = self.dict_vectorizer.fit_transform(self.dicts_rep)
    predictor_rows = vectorized.toarray()
    print(predictor_rows)

    print('Finding optimal feature set...')
    selector = RandomizedLogisticRegression()
    self.ff_model = selector
    # Fit the selector and keep only the columns it retains.
    X_new = selector.fit_transform(predictor_rows, responses)
    print(X_new)
    print('Done! Final Shape: ' + str(X_new.shape))

    print('Building Final model...')
    final_model = LogisticRegression()
    self.model = final_model.fit(X_new, responses)
    print('Done!')
def test_rflasso():
    """Fit a randomized logistic regression on a train split and print the reduced shape.

    Splits the module-level ``index_data`` / ``index_lable`` 75/25 with a
    fixed seed, fits the selector on the training part and prints the shape
    of the transformed training matrix.
    """
    # Only the import that is used is kept. The unused ones included
    # ``sklearn.cross_validation``, which could make the function fail at
    # import time on releases that no longer ship that module.
    from sklearn.linear_model import RandomizedLogisticRegression

    train_X, _test_X, train_Y, _test_Y = train_test_split(
        index_data, index_lable, test_size=0.25, random_state=1)

    randomized_logistic = RandomizedLogisticRegression(C=0.1, n_jobs=2)
    randomized_logistic.fit(train_X, train_Y)
    XX = randomized_logistic.transform(train_X)
    print(XX.shape)
def lasso_regression(X, y):
    """
    Score features with Randomized Logistic Regression, then plot and return
    the ROC AUC of a LogisticRegression as the lowest-scored features are
    dropped in steps of four (0, 4, ..., 96 features removed).
    """
    selector = RandomizedLogisticRegression(C=1.0)
    selector.fit(X, y)
    print('Number of non zero valued coefficients: ',
          np.sum(selector.scores_ > 0))

    # Feature indices in ascending score order: the weakest come first.
    ascending_idx = selector.scores_.argsort()
    X_train, X_test, y_train, y_test = split_examples(X, y)

    drop_counts = range(0, 100, 4)
    qualities = []
    for n_dropped in drop_counts:
        kept = ascending_idx[n_dropped:]
        model = LogisticRegression(C=0.1)
        model.fit(X_train[:, kept], y_train)
        positive_proba = model.predict_proba(X_test[:, kept])[:, 1]
        qualities.append(roc_auc_score(y_test, positive_proba))

    plt.plot(drop_counts, qualities)
    plt.show()
    return qualities
def log_reg_feat_selection(X_train, y_train, X_valid, y_valid, random_state):
    """
    Feature selection based on the scores given to the features by the
    RandomizedLogisticRegression algorithm.

    Args:
        X_train: training feature matrix.
        y_train: training labels.
        X_valid: unused; kept for interface compatibility.
        y_valid: unused; kept for interface compatibility.
        random_state: seed for the randomized model. It was previously
            ignored in favour of a hard-coded 0.

    Returns:
        The fitted model's ``scores_``, which are also saved to
        ``save/feat_sel_log_reg.npy``.
    """
    rlr = RandomizedLogisticRegression(C=[0.001, 0.01, 0.1, 1.],
                                       sample_fraction=0.7,
                                       n_resampling=200,
                                       selection_threshold=0.25,
                                       verbose=5,
                                       n_jobs=-1,
                                       random_state=random_state)
    rlr.fit(X_train, y_train)
    np.save('save/feat_sel_log_reg.npy', rlr.scores_)
    return rlr.scores_
def select_features(X, y):
    '''
    Select the relevant features from X that are useful for predicting the
    labels in y.

    A stability-selection model scores every feature; the score threshold is
    then chosen by 5-fold cross-validation of a logistic regression on the
    features that exceed each candidate threshold.

    Args:
        X: numpy 2D array containing input features
        y: numpy 1D array containing labels

    Returns:
        feature_list: List of indices of the selected important features
    '''
    # Get the selection model (stability selection)
    selection_model = RandomizedLogisticRegression(random_state=0)
    selection_model.fit(X, y)
    feature_scores = selection_model.scores_

    # Candidate thresholds: 1/step_size, 2/step_size, ... up to max_weight.
    # float() keeps the grid fractional even under integer division.
    step_size = 50
    max_weight = int(max(feature_scores)) + 1
    trial_thresholds = [
        i / float(step_size) for i in range(1, max_weight * step_size + 1)
    ]

    threshold = 0
    max_score = 0
    for trial in trial_thresholds:
        selected_features = [
            i for i, score in enumerate(feature_scores) if score > trial
        ]
        if not selected_features:
            # Thresholds only grow, so no later trial can select anything.
            break
        X_reduced = X[:, selected_features]
        model = LogisticRegression(multi_class='multinomial',
                                   class_weight='balanced',
                                   solver='newton-cg',
                                   random_state=0,
                                   max_iter=1000)
        score = cross_val_score(model, X_reduced, y, cv=5).mean()
        # ">=" prefers the largest threshold among equally good ones.
        if score >= max_score:
            max_score = score
            # ``trial`` is already on the score scale; it used to be divided
            # by step_size a second time, which shrank the threshold 50-fold.
            threshold = trial

    return [i for i, score in enumerate(feature_scores) if score > threshold]
def learning_curves(X, y, clf, params, train_sizes=None, feature_selection=False, n_folds=3, scoring='accuracy'):
    """
    Builds learning curves on test set, with parameters chosen on train and
    validation set using nested cross validation

    :param X: data
    :param y: labels
    :param clf: classificator; its parameters are overwritten with the best
        grid-search parameters of each outer fold
    :param params: parameters for grid search
    :param train_sizes: train sizes for building learning curves
        (defaults to five values from 0.5 to 1.0)
    :param feature_selection: whether to choose features by randomized
        logistic regression
    :param n_folds: number of outer cv folds
    :param scoring: scoring metric
    :return: train and test curve, averaged over the outer folds
    """
    if train_sizes is None:
        train_sizes = np.linspace(0.5, 1.0, 5)
    kf = KFold(X.shape[0], n_folds=n_folds)
    # Float accumulators, so integer train_sizes do not break the in-place
    # additions of fractional scores.
    train_curve = np.zeros(len(train_sizes), dtype=float)
    test_curve = np.zeros(len(train_sizes), dtype=float)

    for train_inds, test_inds in kf:
        train_data = X[train_inds]
        test_data = X[test_inds]
        train_labels = y[train_inds]
        test_labels = y[test_inds]

        if feature_selection:
            rlr = RandomizedLogisticRegression()
            rlr.fit(train_data, train_labels)
            inds = [i for i in range(X.shape[1]) if rlr.all_scores_[i] > 0.0]
            print('%d  features chosen' % len(inds))
            train_data = train_data[:, inds]
            # Apply the same selection to the held-out fold; previously the
            # curve was built on all features while the parameters had been
            # tuned on the selected ones.
            test_data = test_data[:, inds]

        gs = GridSearchCV(clf, params, scoring=scoring, cv=5)
        gs.fit(train_data, train_labels)
        bp = gs.best_params_
        print('chosen params:  %s' % (bp,))
        # set_params also handles nested names such as "step__param".
        clf.set_params(**bp)

        lc = learning_curve(clf, test_data, test_labels, scoring=scoring,
                            train_sizes=train_sizes)
        train_curve += lc[1].mean(axis=1)
        test_curve += lc[2].mean(axis=1)

    train_curve /= n_folds
    test_curve /= n_folds
    return train_curve, test_curve
def feature_selection_tech(predictors, responses, test_predictors, selectFeatTech):
    """Select features on the training data and apply the same selection to the test data.

    Args:
        predictors: training feature matrix (must expose ``shape``).
        responses: training labels.
        test_predictors: test feature matrix with the same columns.
        selectFeatTech: 0 for chi-squared ``SelectKBest``, 1 for
            ``RandomizedLogisticRegression``.

    Returns:
        ``(predictors_new, predictors_test_new, indices)``: the reduced
        matrices and the integer indices of the kept columns.

    Raises:
        ValueError: if ``selectFeatTech`` is neither 0 nor 1 (previously this
            surfaced as an unbound-variable error at the return).
    """
    if selectFeatTech == 0:
        # Keep 40 features, capped at the number of available columns. The
        # earlier "40% of the columns" value was overwritten and never used.
        t = min(40, predictors.shape[1])
        model = SelectKBest(chi2, k=t).fit(predictors, responses)
    elif selectFeatTech == 1:
        model = RandomizedLogisticRegression().fit(predictors, responses)
    else:
        raise ValueError(
            "selectFeatTech must be 0 (chi2) or 1 (randomized logistic "
            "regression), got %r" % (selectFeatTech,))

    predictors_new = model.transform(predictors)
    predictors_test_new = model.transform(test_predictors)
    indices = model.get_support(indices=True)
    return predictors_new, predictors_test_new, indices
def predictWithAdaBoost(config, X, Y, testFeatures):
    """Train an AdaBoost classifier on (X, Y) and predict ``testFeatures``.

    When the ``model/adaboost`` configuration sets ``useRandomLog``, the
    features are first reduced with randomized logistic regression. If the
    selector keeps no feature at all, the unreduced features are used.

    Args:
        config: object whose ``getConfig('model/adaboost')`` returns a
            dict-like configuration.
        X: training feature matrix.
        Y: training labels.
        testFeatures: feature matrix to predict.

    Returns:
        The classifier's predictions for ``testFeatures``.
    """
    adaConfig = config.getConfig('model/adaboost')
    if adaConfig.get('useRandomLog', False):
        selector = RandomizedLogisticRegression()
        selector.fit(X, Y)
        X_new = selector.transform(X)
        if X_new.size != 0:
            # Reduce train and test together so their columns stay aligned.
            # Transforming the test data when the training data is left
            # unreduced would make predict() fail.
            X = X_new
            testFeatures = selector.transform(testFeatures)

    clf = AdaBoostClassifier(n_estimators=50, learning_rate=1.0,
                             algorithm='SAMME.R')
    clf.fit(X, Y)
    return clf.predict(testFeatures)
def perform_stability_selection(X_train, y_train, round_id=0):
    """Print stability-selection scores for min-max scaled training data.

    The scores are printed twice: raw, and as ``(score rounded to 5 decimals,
    name)`` pairs in descending order. Names are taken from the module-level
    ``header`` with its first entry skipped. Nothing is returned.
    """
    X_train = perform_scaling(X_train, scaling='minmax')
    print("Round%d - Stability selection -" % round_id)

    rlog = RandomizedLogisticRegression(random_state=30, n_jobs=3,
                                        n_resampling=400)
    rlog.fit(X_train, y_train)
    print("Randomized Logistic Feature_Importances: ", rlog.scores_)

    rounded_scores = [round(score, 5) for score in rlog.scores_]
    ranked = sorted(zip(rounded_scores, header[1:]), reverse=True)
    print("Randomized Logistic Feature_Importances: ", ranked)
def build_classifier(definition, datas):
    """Build the classification pipeline described by ``definition``.

    Args:
        definition: dict whose ``'classification'`` entry is ``'lr'``,
            ``'sgd'`` or ``'sgd_grid'``.
        datas: data handed to ``grid_search_params`` for ``'sgd_grid'``.

    Returns:
        A ``Pipeline`` whose only active step is the chosen classifier.

    Raises:
        ValueError: for an unknown ``definition['classification']``
            (previously an unbound-variable error).
    """
    kind = definition['classification']
    if kind == 'lr':
        classifier = LogisticRegression(C=1.5)
    elif kind == 'sgd':
        classifier = SGDClassifier(alpha=0.0001, n_iter=10**2)
    elif kind == 'sgd_grid':
        best_params = grid_search_params(datas)
        classifier = SGDClassifier(n_iter=10**2, **best_params)
    else:
        raise ValueError("unknown classification type: %r" % (kind,))

    # Not wired into the pipeline at the moment; kept so the
    # 'feature_selection' step below can be re-enabled.
    rlr_feature_selection = RandomizedLogisticRegression(C=1.5, n_jobs=-1,
                                                         verbose=0)

    # Standard sklearn classifier
    clf = Pipeline([
        # ('string_encoder', pp_encode_strings),
        # ('drop_nan_cols', pp_drop_nan_cols),
        # ('fix_collinear', pp_fix_collinear),
        # # ('float_imputer', pp_imputer),
        # ('scaler', pp_scaler),
        # ('feature_selection', rlr_feature_selection),
        ('classification', classifier)
    ])
    return clf
def select_top_features(X, y, ques_dict, top_count=7):
    '''
    Fit RandomizedLogisticRegression and return the best-scoring features.

    Args:
        X(dataframe) -- features
        y(dataframe) -- outcome
        ques_dict(dict) -- variable name to questions dictionary
        top_count(int) -- maximum number of features to return

    Returns:
        list of (question, score) tuples with score > 0, highest score first,
        at most top_count long.
    '''
    selector = RandomizedLogisticRegression()
    fitted = selector.fit(X, y)
    questions = features_to_questions(X.columns, ques_dict)

    ranked = sorted(zip(questions, fitted.scores_),
                    key=lambda tup: tup[1],
                    reverse=True)
    positive = [pair for pair in ranked if pair[1] > 0]
    return positive[:top_count]
def _get_clfs(self):
    """Return a new dict of unfitted estimators keyed by short name."""
    clf_dict = {}
    clf_dict["rlrclf"] = RandomizedLogisticRegression()
    clf_dict["rfclf"] = RandomForestClassifier(criterion='entropy')
    clf_dict["dtrclf"] = DecisionTreeClassifier(criterion='entropy')
    clf_dict["lrclf"] = LogisticRegression()
    return clf_dict
def get_feature_selection_model_from_name(type_of_estimator, model_name):
    """Return the feature-selection model registered for the given estimator type and name.

    Args:
        type_of_estimator: ``'classifier'`` or ``'regressor'``.
        model_name: one of ``'SelectFromModel'``, ``'RFECV'``,
            ``'GenericUnivariateSelect'``, ``'RandomizedSparse'``, ``'KeepAll'``.

    Returns:
        A new, unfitted selector, or the string ``'KeepAll'``.

    Raises:
        KeyError: if either lookup key is unknown.
    """
    # TODO(PRESTON): eventually let threshold be user-configurable (or grid_searchable)
    # TODO(PRESTON): optimize the params used here
    classifier_models = {
        'SelectFromModel': SelectFromModel(RandomForestClassifier(n_jobs=-1)),
        'RFECV': RFECV(estimator=RandomForestClassifier(n_jobs=-1), step=0.1),
        'GenericUnivariateSelect': GenericUnivariateSelect(),
        'RandomizedSparse': RandomizedLogisticRegression(),
        'KeepAll': 'KeepAll',
    }
    regressor_models = {
        'SelectFromModel': SelectFromModel(RandomForestRegressor(n_jobs=-1)),
        'RFECV': RFECV(estimator=RandomForestRegressor(n_jobs=-1), step=0.1),
        'GenericUnivariateSelect': GenericUnivariateSelect(),
        'RandomizedSparse': RandomizedLasso(),
        'KeepAll': 'KeepAll',
    }
    model_map = {
        'classifier': classifier_models,
        'regressor': regressor_models,
    }
    return model_map[type_of_estimator][model_name]
def get_feature_selection_model_from_name(type_of_estimator, model_name):
    """Return the feature-selection model registered for the given estimator type and name.

    Args:
        type_of_estimator: ``'classifier'`` or ``'regressor'``.
        model_name: one of ``'SelectFromModel'``, ``'RFECV'``,
            ``'GenericUnivariateSelect'``, ``'RandomizedSparse'``, ``'KeepAll'``.

    Returns:
        A new, unfitted selector, or the string ``'KeepAll'``.

    Raises:
        KeyError: if either lookup key is unknown.
    """
    classifier_forest = RandomForestClassifier(n_jobs=-1, max_depth=10,
                                               n_estimators=15)
    classifier_models = {
        'SelectFromModel': SelectFromModel(classifier_forest,
                                           threshold='20*mean'),
        'RFECV': RFECV(estimator=RandomForestClassifier(n_jobs=-1), step=0.1),
        'GenericUnivariateSelect': GenericUnivariateSelect(),
        'RandomizedSparse': RandomizedLogisticRegression(),
        'KeepAll': 'KeepAll',
    }

    regressor_forest = RandomForestRegressor(n_jobs=-1, max_depth=10,
                                             n_estimators=15)
    regressor_models = {
        'SelectFromModel': SelectFromModel(regressor_forest,
                                           threshold='0.7*mean'),
        'RFECV': RFECV(estimator=RandomForestRegressor(n_jobs=-1), step=0.1),
        'GenericUnivariateSelect': GenericUnivariateSelect(),
        'RandomizedSparse': RandomizedLasso(),
        'KeepAll': 'KeepAll',
    }

    model_map = {
        'classifier': classifier_models,
        'regressor': regressor_models,
    }
    return model_map[type_of_estimator][model_name]
def predictWithQDA(config, X, Y, testFeatures):
    """Train a QDA classifier on (X, Y) and predict ``testFeatures``.

    When the ``model/qda`` configuration sets ``useRandomLog``, the features
    are first reduced with randomized logistic regression. If the selector
    keeps no feature at all, the unreduced features are used.

    Args:
        config: object whose ``getConfig('model/qda')`` returns a dict-like
            configuration; its optional ``priors`` entry is passed to ``QDA``.
        X: training feature matrix.
        Y: training labels.
        testFeatures: feature matrix to predict.

    Returns:
        The classifier's predictions for ``testFeatures``.
    """
    qdaConfig = config.getConfig('model/qda')
    if qdaConfig.get('useRandomLog', False):
        selector = RandomizedLogisticRegression()
        selector.fit(X, Y)
        X_new = selector.transform(X)
        if X_new.size != 0:
            # Reduce train and test together so their columns stay aligned.
            # Transforming the test data when the training data is left
            # unreduced would make predict() fail.
            X = X_new
            testFeatures = selector.transform(testFeatures)

    priors = qdaConfig.get('priors', None)
    clf = QDA(priors=priors)
    clf.fit(X, Y)
    return clf.predict(testFeatures)
class LogisticModelBuilder(object):
    """Builds a logistic model over user and interaction attributes.

    The attributes are converted to a list-of-dicts representation,
    vectorized, reduced with randomized logistic regression and finally
    fitted with a plain ``LogisticRegression``.

    The ``map``/``filter`` pipelines were replaced with list comprehensions
    so the class behaves the same on Python 2 and Python 3. The originals
    relied on ``map``/``filter`` returning lists.
    """

    def __init__(self):
        self.inter_levels = None   # per interaction attribute: set of values or (min, max)
        self.dicts_rep = None      # list-of-dicts representation of the rows
        self.dict_vectorizer = DictVectorizer()
        self.ff_model = None       # fitted feature selector
        self.model = None          # fitted final LogisticRegression

    def set_data(self, user_atts, inter_atts, responses):
        """Vectorize the attributes, select features and fit the final model."""
        self.build_data_representations(user_atts, inter_atts)
        # Convert from dict representation into matrix:
        predictor_rows = self.dict_vectorizer.fit_transform(self.dicts_rep).toarray()
        print(predictor_rows)
        print('Finding optimal feature set...')
        self.ff_model = RandomizedLogisticRegression()  # Finds best set of features
        # Fit data and get transformed input rows:
        X_new = self.ff_model.fit_transform(predictor_rows, responses)
        print(X_new)
        print('Done! Final Shape: ' + str(X_new.shape))
        print('Building Final model...')
        self.model = LogisticRegression().fit(X_new, responses)
        print('Done!')

    def set_data_rows(self, tuples):
        """Set data from tuples/rows; they are unzipped into the arguments of ``set_data``."""
        self.set_data(*ut.unzip(tuples))

    def build_data_representations(self, user_atts, inter_atts):
        """Build the list-of-dicts representation and the interaction factor levels.

        For every interaction attribute (a column of ``inter_atts``) the
        stored level is the set of distinct values if any value is a string,
        otherwise the ``(min, max)`` pair of the values.
        """
        print('Building internal data representations...')
        print('  Building factor level matrix...')
        # Transpose the rows and collect the distinct values per attribute.
        level_sets = [set(column) for column in zip(*inter_atts)]
        self.inter_levels = [
            levels if any(isinstance(value, str) for value in levels)
            else (min(levels), max(levels))
            for levels in level_sets
        ]
        print('  Building dict list representation...')
        self.dicts_rep = dict_list_representation(user_atts, inter_atts)
        print('Done!')

    def prob_f(self):
        """Return a function ``f(X, Y) -> P``.

        ``X`` holds the user attribute values and ``Y`` the interaction
        attribute values. The result is the second column of
        ``predict_proba`` for that single row, described in the original
        comment as P(R = 1).
        NOTE(review): this assumes the second class of the fitted model is
        R = 1 -- confirm against the response encoding.
        """
        dv = self.dict_vectorizer
        ff = self.ff_model
        mod = self.model

        def prob(X, Y):
            rows = dv.transform(dict_list_representation([X], [Y])).toarray()
            return mod.predict_proba(ff.transform(rows))[0][1]

        return prob

    def inter_attr_levels(self):
        """Return the levels of each interaction attribute.

        For each attribute the following rule is applied:
          1) If the attribute is categorical the attribute levels are a list
             of unique values.
          2) If the attribute is numeric then a pair (min, max) is returned
             bounding the values.
        """
        return [lv if isinstance(lv, tuple) else list(lv)
                for lv in self.inter_levels]
def programmer_1():
    """Select features with randomized logistic regression, then fit and score a logistic regression.

    Reads ``data/bankloan.xls``, uses the first eight columns as features and
    the ninth as the label, prints the selected column names and the accuracy
    of the final model on the same data it was trained on.
    """
    filename = "data/bankloan.xls"
    data = pd.read_excel(filename)
    # ``.values`` replaces ``as_matrix()``, which newer pandas no longer provides.
    x = data.iloc[:, :8].values
    y = data.iloc[:, 8].values

    rlr = RLR()
    rlr.fit(x, y)
    rlr_support = rlr.get_support()
    # Drop the label column ('违约') so the mask lines up with the feature columns.
    support_col = data.drop('违约', axis=1).columns[rlr_support]

    print(
        "rlr_support_columns: {columns}".format(columns=','.join(support_col)))
    x = data[support_col].values

    lr = LR()
    lr.fit(x, y)
    print("lr: {score}".format(score=lr.score(x, y)))
def tipdm_chapter5_test():
    """Select features with randomized logistic regression, then fit and score a logistic regression.

    Reads the bank-loan spreadsheet, uses the first eight columns as features
    and the ninth as the label, prints the selected column names and the
    accuracy of the final model on the same data it was trained on.
    """
    # Parameter initialisation
    filename = '../../../MyFile/chapter5/data/bankloan.xls'
    data = pd.read_excel(filename)
    # ``.values`` replaces ``as_matrix()``, which newer pandas no longer provides.
    x = data.iloc[:, :8].values
    y = data.iloc[:, 8].values

    # Feature selection
    rlr = RLR()   # randomized logistic regression model used to screen variables
    rlr.fit(x, y)
    # Selection mask; per-feature scores are available as ``rlr.scores_``.
    features = rlr.get_support()
    # The mask has one entry per feature column, so index only those columns.
    # Indexing all of ``data.columns`` (which also holds the label) with the
    # shorter mask is a length mismatch.
    selected_columns = data.columns[:8][features]
    print(u'通过随机逻辑回归模型筛选特征结束。')
    print(u'有效特征为: {0}'.format(','.join(selected_columns)))
    x = data[selected_columns].values   # keep only the selected features

    # Training and scoring
    lr = LR()     # logistic regression model
    lr.fit(x, y)  # train on the selected features
    print(u'逻辑回归模型训练结束。')
    print(u'模型的平均正确率为: {0}'.format(lr.score(x, y)))  # mean accuracy
def rank_random_logistic_regression(self,
                                    features_indep_df: PandasDataFrame,
                                    feature_target: List,
                                    n_jobs: int = -1,
                                    **kwargs: Any) -> object:
    """Rank features with Randomized Logistic Regression.

    The fitted model exposes the ranking through ``model.scores_`` and
    ``model.all_scores_``.

    :param features_indep_df: the independent features fed to the model.
    :param feature_target: the target feature being estimated.
    :param n_jobs: number of CPUs used during resampling; ``-1`` uses all CPUs.
    :param kwargs: extra ``RandomizedLogisticRegression`` options, e.g.
        C=1, scaling=0.5, sample_fraction=0.75, n_resampling=200,
        selection_threshold=0.25, tol=0.001, fit_intercept=True,
        verbose=False, normalize=True, random_state=None,
        pre_dispatch='3*n_jobs'
    :return: whatever ``fit`` returns (the importance ranking model).
    """
    self.__logger.debug("Run Random Logistic Regression.")
    ranking_model = RandomizedLogisticRegression(n_jobs=n_jobs, **kwargs)
    fitted = ranking_model.fit(features_indep_df, feature_target)
    return fitted
def feature_method_selection(data, label, fsname):
    """
    Select features with the method named by ``fsname``.

    :param data: feature matrix
    :param label: labels (converted with ``turn_label_2num`` for 'RandForReg')
    :param fsname: one of 'variance_threshold', 'select_kbest', 'rfe',
        'rfecv', 'RandLasso', 'linear_svc', 'tree', 'fclassif', 'RandForReg'
    :return: new_data, the data reduced to the selected features
    :raises NotImplementedError: for 'pearsonr', which yields a correlation
        rather than a selector
    :raises ValueError: for an unknown ``fsname``
    """
    if fsname == 'variance_threshold':
        # Discards features that barely vary (meant for discrete values).
        model = VarianceThreshold()  # th=1
        return model.fit_transform(data)
    elif fsname == 'select_kbest':
        # Feature values must be non-negative; chi2 is a classification score.
        model = SelectKBest(chi2, k=10)
    elif fsname == 'rfe':
        # Recursive feature elimination; very slow.
        svc = SVC(kernel='linear', C=1)
        model = RFE(estimator=svc, n_features_to_select=10, step=1)
    elif fsname == 'rfecv':
        # RFE with cross-validation; labels must be numeric. The selector
        # used to be bound to a different name, leaving ``model`` undefined.
        # NOTE(review): StratifiedKFold(label, 1) asks for a single fold --
        # confirm this is accepted by the installed scikit-learn.
        svc = SVC(kernel="linear")
        model = RFECV(estimator=svc, step=1, cv=StratifiedKFold(label, 1),
                      scoring='accuracy')
    elif fsname == 'RandLasso':
        # Randomized re-selection. Original author's note: fails with
        # "cannot perform reduce with flexible type" on non-numeric input.
        model = RandomizedLogisticRegression()
    elif fsname == 'linear_svc':
        # No feature importance available (original author's note).
        # NOTE(review): relies on the estimator exposing ``transform`` -- confirm.
        model = LinearSVC()
    elif fsname == 'tree':
        # NOTE(review): relies on the estimator exposing ``transform`` -- confirm.
        model = ExtraTreesClassifier()
    elif fsname == 'fclassif':
        # Defaults to f_classif; larger values mean more useful features.
        model = SelectFpr()
    elif fsname == 'pearsonr':
        # The old branch computed pearsonr(data, label) and then crashed on
        # the undefined ``model``. A correlation does not select columns.
        raise NotImplementedError(
            "'pearsonr' yields a correlation, not a feature subset")
    elif fsname == 'RandForReg':
        # Labels must be numeric.
        label = turn_label_2num(label)
        # NOTE(review): relies on the estimator exposing ``transform`` -- confirm.
        model = RandomForestRegressor()
    else:
        logging.error('ERROR: feature selection option is wrong')
        raise ValueError(
            'unknown feature selection option: %r' % (fsname,))

    model.fit(data, label)
    new_data = model.transform(data)  # data restricted to the selected features
    return new_data
def run_logreg(X_train, y_train, selection_threshold=0.2):
    """Fit a feature-selection + logistic-regression pipeline and print its accuracies.

    Args:
        X_train: training feature matrix.
        y_train: training labels.
        selection_threshold: forwarded to ``RandomizedLogisticRegression``.

    Returns:
        The fitted ``Pipeline``.
    """
    print('\nrunning logistic regression...')
    print('using a selection threshold of {}'.format(selection_threshold))

    selector = RandomizedLogisticRegression(
        selection_threshold=selection_threshold)
    steps = [
        ('feature_selection', selector),
        ('classification', LogisticRegression()),
    ]
    pipe = Pipeline(steps)
    pipe.fit(X_train, y_train)

    train_accuracy = pipe.score(X_train, y_train)
    print('training accuracy : {}'.format(train_accuracy))
    # NOTE(review): X_test / y_test are not parameters; they are looked up in
    # the enclosing (module) scope and must exist there.
    test_accuracy = pipe.score(X_test, y_test)
    print('testing accuracy : {}'.format(test_accuracy))
    return pipe
def rank_features(algorithm, X, y):
    """Print the feature names ranked by ``algorithm``, one per line.

    Feature names come from the module-level ``features`` and seeds from
    ``R_SEED``. Supported algorithms: 'random_forest_rfe', 'svm_rfe',
    'random_logistic_regression', 'random_lasso' and 'anova'. Any other
    value prints an error message and calls ``exit(1)``.
    """

    def _print_ranked(values, descending=False):
        # Pair each value (rounded to 4 decimals) with its feature name and
        # print the names in sorted order of those pairs.
        pairs = zip([round(value, 4) for value in values], features)
        for pair in sorted(pairs, reverse=descending):
            print(pair[1])

    # The RFE approach can be used with various different classifiers.
    if algorithm in ('random_forest_rfe', 'svm_rfe'):
        from sklearn.feature_selection import RFE
        if algorithm == 'random_forest_rfe':
            from sklearn.ensemble import RandomForestClassifier
            estimator = RandomForestClassifier(n_estimators=50,
                                               random_state=R_SEED,
                                               n_jobs=1)
        else:
            from sklearn.svm import SVC
            estimator = SVC(random_state=R_SEED, kernel='linear')
        selector = RFE(estimator, 5, step=0.1)
        selector.fit(X, y)
        _print_ranked(selector.ranking_)
    elif algorithm == 'random_logistic_regression':
        # See http://blog.datadive.net/selecting-good-features-part-iv-stability-selection-rfe-and-everything-side-by-side/
        from sklearn.linear_model import RandomizedLogisticRegression
        rlasso = RandomizedLogisticRegression(random_state=R_SEED)
        rlasso.fit(X, y)
        _print_ranked(rlasso.scores_, descending=True)
    elif algorithm == 'random_lasso':
        from sklearn.linear_model import RandomizedLasso
        rlasso = RandomizedLasso(random_state=R_SEED)
        rlasso.fit(X, y)
        _print_ranked(rlasso.scores_, descending=True)
    elif algorithm == 'anova':
        from sklearn.feature_selection import f_classif
        _, pval = f_classif(X, y)
        # Sort by p-value; the random secondary key breaks ties randomly.
        random_array = random.random(len(pval))
        order = lexsort((random_array, pval))
        for i in order:
            print(features[i])
    else:
        print("Invalid algorithm: %s" % algorithm)
        exit(1)
def get_support_fields(X, Y):
    '''
    Fit a randomized logistic regression on (X, Y), print its scores and the
    selected column names, and return the selected columns of the
    module-level ``data`` frame as a matrix.
    '''
    selector = RLR()    # randomized logistic regression used to screen variables
    selector.fit(X, Y)  # train the model
    print(selector.scores_)

    # Boolean selection mask; per-feature scores are in ``scores_``.
    chosen_columns = data.columns[selector.get_support()]
    # NOTE(review): ``.decode`` is called on the joined string, which looks
    # like it expects a Python 2 byte string -- confirm before porting.
    print(u'有效特征为:%s' % (','.join(chosen_columns)).decode('utf-8'))

    selected_matrix = data[chosen_columns].as_matrix()  # keep the selected features
    return selected_matrix
def rdlg_variables(X, y, threshold=0.25):
    """Select variables with randomized logistic regression (stability selection).

    Background (translated from the original note): stability selection
    combines subsampling with a selection algorithm such as regression or
    SVM. The selector is run repeatedly on different subsets of the data and
    of the features, and the results are aggregated, for example as the
    fraction of runs in which a feature was judged important. Ideally
    important features score close to 100%, weaker ones score above zero and
    useless ones score close to zero. Unlike Lasso, good features are not
    pushed to zero merely because similar or correlated features exist.

    Args:
        X: feature DataFrame.
        y: labels.
        threshold: selection threshold (default 0.25), used both by the model
            and for the final filter.

    Returns:
        ``(scoretable, refesh_data)``: a frame with columns ``Col`` and
        ``value_retio`` holding the aggregated scores, and ``X`` restricted
        to the columns whose score exceeds ``threshold``.
    """
    selector = RandomizedLogisticRegression(selection_threshold=threshold)
    selector.fit(X, y)

    # Aggregated per-feature scores, one row per column of X.
    raw_scores = pd.DataFrame(selector.all_scores_, index=X.columns)
    scoretable = raw_scores.reset_index().rename(
        columns={'index': 'Col', 0: 'value_retio'}, copy=False)

    # Keep only the features scoring above the threshold.
    kept = scoretable[scoretable.value_retio > threshold]
    refesh_data = X[list(kept['Col'])]
    return scoretable, refesh_data
def pick_variables(self, descover=True, method="rlr", threshold=0.25, auto_pick=True):
    """Variable-picking helper (feature selection).

    With ``method="rlr"`` the features are chosen by randomized logistic
    regression (stability selection): the selector is run repeatedly on
    different subsets of the data and of the features, and the results are
    aggregated into a per-feature score. Ideally important features score
    close to 100%, weaker ones above zero and useless ones close to zero.

    ``self.X_train`` and ``self.X_test`` are narrowed to the selected
    columns. When ``auto_pick`` is true, ``self.picked_data`` is set to the
    selected columns of ``self.data`` plus the ``"y"`` column.

    Args:
        descover: unused; kept for interface compatibility.
        method: selection method; only ``"rlr"`` is implemented.
        threshold: selection threshold passed to the model (default 0.25).
        auto_pick: whether to populate ``self.picked_data``.

    Returns:
        DataFrame with one ``var_score`` column holding the aggregated score
        of every original training column.

    Raises:
        ValueError: if ``method`` is not ``"rlr"``.
    """
    if method != "rlr":
        raise ValueError("unsupported selection method: %r" % (method,))

    rlr = RandomizedLogisticRegression(selection_threshold=threshold)
    rlr.fit(self.X_train, self.y_train)
    # Aggregated per-feature scores.
    scoretable = pd.DataFrame(rlr.all_scores_,
                              index=self.X_train.columns,
                              columns=['var_score'])

    # Names of the columns kept by the selector's support mask.
    columns_need = list(self.X_train.columns[rlr.get_support()])
    self.X_train = self.X_train[columns_need]
    self.X_test = self.X_test[columns_need]

    columns_need.append("y")
    if auto_pick:
        self.picked_data = self.data[columns_need]
    return scoretable
def programmer_1():
    """Select features with randomized logistic regression, then fit and score a logistic regression.

    Reads ``bankloan.xls``, uses the first eight columns as features and the
    ninth as the label, prints the selection mask, the selected column names
    and the accuracy of the final model on the same data it was trained on.
    """
    # Parameter initialisation
    filename = r'bankloan.xls'
    data = pd.read_excel(filename)
    # ``.values`` replaces ``as_matrix()``, which newer pandas no longer provides.
    x = data.iloc[:, :8].values
    y = data.iloc[:, 8].values

    rlr = RLR()    # randomized logistic regression for feature/variable screening
    rlr.fit(x, y)  # train the model
    egeList = rlr.get_support()  # mask of the selected features
    # Pad the mask with False up to the number of columns in ``data``, so the
    # label column (and any further column) is never selected. For the
    # nine-column sheet this appends exactly one False, as before.
    padding = np.zeros(len(data.columns) - len(egeList), dtype=bool)
    egeList = np.concatenate([egeList, padding])
    print("rlr.get_support():")
    print(egeList)
    print(u'随机逻辑回归模型特征选择结束!!!')
    print(u'有效特征为:%s' % ','.join(data.columns[egeList]))
    x = data[data.columns[egeList]].values  # keep only the selected features

    lr = LR()     # logistic regression model
    lr.fit(x, y)  # train on the selected features
    print(u'逻辑回归训练模型结束!!!')
    # Mean accuracy; the original author reports 81.4% for this example.
    print(u'模型的平均正确率:%s' % lr.score(x, y))
def lasso_regression(X, y):
    """
    Score features with Randomized Logistic Regression, then plot and return
    the ROC AUC of a LogisticRegression as the lowest-scored features are
    dropped in steps of four (0, 4, ..., 96 features removed).
    """
    stability_model = RandomizedLogisticRegression(C=1.0)
    stability_model.fit(X, y)
    print('Number of non zero valued coefficients: ',
          np.sum(stability_model.scores_ > 0))

    # Ascending score order, so slicing from ``start`` drops the weakest first.
    order = stability_model.scores_.argsort()
    X_train, X_test, y_train, y_test = split_examples(X, y)

    def _auc_without_weakest(start):
        # Train and evaluate on every feature except the ``start`` weakest.
        columns = order[start:]
        classifier = LogisticRegression(C=0.1)
        classifier.fit(X_train[:, columns], y_train)
        scores = classifier.predict_proba(X_test[:, columns])[:, 1]
        return roc_auc_score(y_test, scores)

    qualities = []
    for start in range(0, 100, 4):
        qualities.append(_auc_without_weakest(start))

    plt.plot(range(0, 100, 4), qualities)
    plt.show()
    return qualities
from __future__ import division import numpy as np from sklearn.linear_model import RandomizedLogisticRegression from sklearn.linear_model import LogisticRegression X = np.load("../feats/train_formatted.npy") y = np.load("../feats/train_y.npy") X_test = np.load("../feats/test_formatted.npy") y_test = np.load("../feats/test_y.npy") clf = RandomizedLogisticRegression() clf.fit(X, y) scores = clf.scores_ print 'Index : score' sortedIdx = [i[0] for i in sorted(enumerate(scores), key=lambda x:x[1], reverse=True)] top = 30 for i in range(top): print str(sortedIdx[i]) + ' : ' + str(scores[sortedIdx[i]]) lr = LogisticRegression() lr.fit(clf.transform(X), y) pred = lr.predict(clf.transform(X_test)) accuracy = sum(pred == y_test)/y_test.size print 'Logistic Regression Accuracy: ' + str(accuracy)
y_inv = Counter(lb_encoder.inverse_transform(y))
print("Classes:", y_inv)

# 'Normalize/Scale features if needed. Our data is standardized by default'
# X = StandardScaler(copy=False).fit_transform(X)

# Univariate filter: keep the features retained by SelectFwe at alpha=0.01.
Fwe = SelectFwe(alpha=0.01).fit(X, y)
X = Fwe.transform(X)
featureNames = featureNames[Fwe.get_support()]
print("F-test filter ->", X.shape)

FeatSelection_SVM = True
FeatSelection_RandLogReg = False

if FeatSelection_RandLogReg == True:
    LogRegFeats = RandomizedLogisticRegression(C=5, scaling=0.5,
                                               sample_fraction=0.8,
                                               n_resampling=60,
                                               selection_threshold=0.2,
                                               n_jobs=-1)
    X = LogRegFeats.fit_transform(X, y)
    featureNames = featureNames[LogRegFeats.get_support()]
    print("RandomizedLogisticRegression Feature Selection ->:", X.shape)

elif FeatSelection_SVM == True:
    # Keep the fitted L1 SVM so the feature names can be reduced with the
    # same mask as X. This branch used to read ``LogRegFeats``, which only
    # exists in the branch above, and failed with a NameError.
    svc_L1 = LinearSVC(C=1, penalty="l1", dual=False,
                       class_weight='auto').fit(X, y)
    # A feature is kept when its absolute coefficients, summed over classes,
    # are numerically non-zero.
    # NOTE(review): the 1e-5 cutoff is meant to mirror what
    # LinearSVC.fit_transform applied for an l1 penalty -- confirm against
    # the installed scikit-learn.
    svc_mask = abs(svc_L1.coef_).sum(axis=0) >= 1e-5
    X = X[:, svc_mask]
    # X= LogisticRegression(C=0.01,class_weight='auto').fit_transform(X, y)
    featureNames = featureNames[svc_mask]
    print("SVC Transformed X:", X.shape)

'''
print("Plot #Feats vs Classification performance:")
PlotPerfPercentFeatures(X_LR,y,est=SVC(C=100))
'''
def GetAllPerf (filePaths=None):
    """
    Evaluate tuned classifiers on each training-set feature file and write a
    summary table to "OutputData.tsv".

    For every file the function loads the data, filters the features
    (SelectKBest followed by SelectFwe), computes a most-frequent-class dummy
    baseline, grid-searches one estimator for F1 and one for accuracy via
    ModelParam_GridSearch, cross-validates both with StratifiedShuffleSplit
    and stores the results in one row of the summary DataFrame.

    Parameters
    ----------
    filePaths : list of str, optional
        Feature files to process. When None, every file named
        'trainingSetFeatures.csv' found under './test_seq' is used.

    Returns
    -------
    None
        Results are printed and saved to "OutputData.tsv".
    """
    # Public import path for make_scorer; imported once instead of per file.
    from sklearn.metrics import make_scorer

    if filePaths is None:
        filePaths = list(find_files(directory='./test_seq', pattern='trainingSetFeatures.csv'))

    #Sanity check:
    # filePaths=['/a/fr-05/vol/protein/danofer/ProtFeat/feat_extract/test_seq/Thermophile']
    # filePaths=['./test_seq/NP/NP2/Train/trainingSetFeatures.csv']

    print("FilePaths: \n",filePaths)
    fileNames=fileNameFromPaths (filePaths)
    print("FileNames:",fileNames)

    # resDict holds the results for each file/class, for saving to the output file.
    resDict = pd.DataFrame(index=fileNames,
                           columns=['Accuracy','Accuracy_SD',
                                    'f1','f1_SD','dummy_freq:Accuracy','dummy_freq:f1',
                                    'LargestClassPercent','Classes',
                                    # 'TopRFE-Features','Best (f1) Model parameters',
                                    '# Classes',
                                    'Array-Acc-Scores' ,'Array-f1-Scores'
                                    ,'bestML-Acc','bestML-f1','dummy_freq_f1_weighted'])

    # Custom F1 scorer for the grid search. The averaging mode is named once so
    # the dummy baseline below is computed with exactly the same setting.
    # (Other options: weighted, micro, macro, None.)
    f1_average = "micro"
    f1_scorer = make_scorer(metrics.f1_score, greater_is_better=True, average=f1_average)

    for i, filePath in enumerate(filePaths):
        # Normalise path separators (backslashes in Windows file names), see
        # http://pythonconquerstheuniverse.wordpress.com/2008/06/04/gotcha-%E2%80%94-backslashes-in-windows-filenames/
        filePath = os.path.normpath(filePath)
        print(filePath)
        fileName=str(fileNames[i]) #Str added now 14.1
        print("fileName: %s" %(fileName))

        # X, y = features, labels
        X, y, lb_encoder,featureNames = load_data(filePath, 'file')
        print(X.shape,"= (samples, features)")

        y_inv = Counter(lb_encoder.inverse_transform(y))
        MajorityPercent = round(100*y_inv.most_common(1)[0][1]/sum(y_inv.values()),1)
        print("Classes:", lb_encoder.classes_)
        print("MajorityClassPercent:", MajorityPercent)

        resDict['LargestClassPercent'][fileName] = MajorityPercent
        resDict['Classes'][fileName] = str(lb_encoder.classes_)
        resDict["# Classes"][fileName]=len(lb_encoder.classes_)

        # This is just temporary for the outputs - saves computation time.
        # Barely filters compared to the model itself. Set to None to disable.
        KFilt=350

        if KFilt is not None:
            # Never request more features than the data actually has.
            k = SelectKBest(k=min(KFilt, X.shape[1])).fit(X,y)
            X=k.transform(X)
            featureNames=featureNames[k.get_support()]

        Fwe = SelectFwe(alpha=0.01).fit(X,y)
        X=Fwe.transform(X)
        featureNames=featureNames[Fwe.get_support()]

        print("X reduced to K best features: ",X.shape)

        # Optional model-based selection (both disabled by default).
        # NOTE(review): these branches narrow featureNames but leave X untouched
        # (the "X=X_L1" assignment is commented out), so featureNames stops
        # matching X's columns if either flag is enabled. Feature names need
        # updating before enabling them.
        FeatSelection_SVM=False
        FeatSelection_RandLogReg=False

        if FeatSelection_RandLogReg:
            LogRegFeats = RandomizedLogisticRegression(C=10, scaling=0.5,
                                                       sample_fraction=0.95, n_resampling=40,
                                                       selection_threshold=0.2,n_jobs=-1).fit(X,y)
            X_L1 = LogRegFeats.transform(X)
            featureNames=featureNames[LogRegFeats.get_support()]
            print("RandomizedLogisticRegression Feature Selection ->:",X_L1.shape)
        elif FeatSelection_SVM:
            svc_L1= LinearSVC(C=30, penalty="l2", dual=False,class_weight='auto').fit(X, y)
            # transform takes only X; the original also passed y, which would
            # have been interpreted as an extra (threshold) argument.
            X_L1 = svc_L1.transform(X)
            featureNames=featureNames[list(set(np.where(svc_L1.coef_ != 0)[-1]))]
            print ("L1 SVM Transformed X:",X_L1.shape)
        # X=X_L1

        '''
        print("Performance as a function of percent of features used:")
        PlotPerfPercentFeatures(X,y,est=LinearSVC())
        '''

        # EG - graph best features; feature selection using RF, ensemble classifiers..
        # http://nbviewer.ipython.org/github/herrfz/dataanalysis/blob/master/assignment2/samsung_data_prediction_submitted.ipynb
        RFE_FeatsToKeep = 16
        FeatSelection_RFE=False
        FeatSelection_RFECV=False

        if FeatSelection_RFE or FeatSelection_RFECV:
            # RFE - best feats
            # http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html
            svc = LinearSVC(class_weight='auto')#,penalty='l1',dual=False)
            # svc = LogisticRegression(class_weight='auto')#,C=1)

            if FeatSelection_RFECV:
                rfecv = RFECV(estimator=svc, step=RFE_FeatsToKeep,scoring='average_precision')
                # ,cv=StratifiedShuffleSplit(y,n_iter=3,test_size=0.3))
                #,scoring='f1',verbose=0) # " scoring='roc_auc','recall','f1',accuracy..."
            else:
                rfecv = RFE(estimator=svc,n_features_to_select=RFE_FeatsToKeep, step=0.03)
            rfecv.fit(X, y)
            if FeatSelection_RFECV:
                print("RFE-CV selected %d features : " % (rfecv.n_features_))
            print("RFE (%d features) scorer : " % (rfecv.n_features_),rfecv.score(X, y) )
            rfe_featnames = featureNames[rfecv.get_support()]
            featureNames = featureNames[rfecv.get_support()]
            print("RFE selected feature names:",rfe_featnames)
            # Reuse the selector fitted above instead of fitting it a second
            # time, so X_RFE matches the support used for the feature names.
            X_RFE = rfecv.transform(X)
            print("X_RFE",X_RFE.shape)

            # 'TopRFE-Features' is not among the initial columns; .loc creates
            # it on first use instead of failing with a KeyError.
            resDict.loc[fileName, 'TopRFE-Features'] = str(rfe_featnames)

        # Set GetRFEPerf to True (or by user) if performance of the reduced set is wanted.
        GetRFEPerf=False

        # print("lb_encoder.classes_",lb_encoder.classes_)
        # Blind score boxplot graphic example using Seaborn:
        #   http://nbviewer.ipython.org/github/cs109/2014/blob/master/homework-solutions/HW5-solutions.ipynb
        # Confusion matrixes + Dummies:
        #   http://bugra.github.io/work/notes/2014-11-22/an-introduction-to-supervised-learning-scikit-learn/
        #   http://scikit-learn.org/stable/modules/model_evaluation.html#dummy-estimators
        #   http://blog.yhathq.com/posts/predicting-customer-churn-with-sklearn.html
        print()

        # Baseline: always predict the most frequent class.
        dummy_frequent = DummyClassifier(strategy='most_frequent',random_state=0)
        y_dummyPred = Get_yPred(X,y,clf_class=dummy_frequent)
        dummy_freq_acc = '{:.3}'.format(metrics.accuracy_score(y,y_dummyPred ))
        # A scorer is called as scorer(estimator, X, y); calling it with
        # (y_true, y_pred) as the original did raises a TypeError. Use the
        # underlying metric directly with the scorer's averaging mode.
        # NOTE(review): despite the column name, this is the f1_average
        # ("micro") F1, mirroring f1_scorer - confirm the intended averaging.
        dummy_freq_f1_weighted = '{:.3}'.format(
            metrics.f1_score(y, y_dummyPred, average=f1_average))
        # Unweighted mean of the per-class F1 scores.
        dummy_freq_f1_mean=(metrics.f1_score(y, y_dummyPred,average=None)).mean()

        # print("Dummy, most frequent acc:",dummy_freq_acc)
        # dummy_stratifiedRandom = DummyClassifier(strategy='stratified',random_state=0)
        # dummy_strat2= '{:.3%}'.format(metrics.accuracy_score(y, Get_yPred(X,y,clf_class=dummy_frequent))) #,sample_weight=balance_weights(y)))
        # 'print("Dummy, Stratified Random:",dummy_strat2)'
        print()

        resDict['dummy_freq:Accuracy'][fileName]=dummy_freq_acc
        resDict['dummy_freq:f1'][fileName]=dummy_freq_f1_mean
        resDict['dummy_freq_f1_weighted'][fileName]=dummy_freq_f1_weighted
        # resDict.dummy_Stratfreq[fileName]=dummy_strat2

        # We get separately the best model for accuracy and the best for F1.
        # WARNING: in the binary case the default 'f1' scoring works for the
        # positive class only in sklearn 0.15 and lower; the custom scorer is a
        # temporary workaround until the F1 metric is updated.
        bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = f1_scorer)
        bestEst_acc,bestScore_acc = ModelParam_GridSearch(X,y,cv=2,scoreParam = 'accuracy')
        print("bestEst (f1):",bestEst_f1)#,"best f1",bestScore_f1)
        # Label corrected: this line reports the accuracy-tuned estimator.
        print("bestEst (acc):",bestEst_acc)#,"best acc",bestScore_acc)

        #Temp
        # bestEst_f1=bestEst_acc=bestEst = RandomForestClassifier(n_jobs=-1)

        if GetRFEPerf:
            bestEst_RFE,bestScore_RFE = ModelParam_GridSearch(X_RFE,y,cv=3,scoreParam = 'f1')

        # Modified to get 2 estimators: one evaluated on accuracy, one on F1.
        scores_acc = cross_val_score(estimator=bestEst_acc, X=X, y=y,
                                     cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18),
                                     n_jobs=-1) #Accuracy
        print("Accuracy: %0.3f (+- %0.2f)" % (scores_acc.mean(), scores_acc.std() * 2))
        scores_f1 = cross_val_score(estimator=bestEst_f1, X=X, y=y,
                                    cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18),
                                    n_jobs=-1, scoring='f1')
        print("f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))

        resDict['Accuracy'][fileName]=round(scores_acc.mean(),4)
        resDict['Accuracy_SD'][fileName]=round(scores_acc.std(),4)
        resDict['f1'][fileName]=round(scores_f1.mean(),4)
        resDict['f1_SD'][fileName]=round(scores_f1.std(),4)
        resDict['Array-f1-Scores'][fileName]=(scores_f1)
        resDict['Array-Acc-Scores'][fileName]=(scores_acc)
        resDict['bestML-f1'][fileName]=(str(bestEst_f1))
        resDict['bestML-Acc'][fileName]=(str(bestEst_acc))

        #ORIG
        # Acc,Acc_SD,f1,f1_SD = CV_multi_stats(X, y, bestEst,n=15)
        # resDict['Accuracy'][fileName]=round(Acc,4)
        # resDict['Accuracy_SD'][fileName]=round(Acc_SD,4)
        # resDict['f1 score'][fileName]=round(f1,4)
        # resDict['f1_SD'][fileName]=round(f1_SD,4)
        # resDict['Best (f1) Model parameters'][fileName]= bestEst

        print()
        # print(fileName," Done")

    print("Saving results to file")
    # NOTE: the file is comma-separated despite its .tsv extension; kept as-is
    # so existing consumers of the output are not broken.
    resDict.to_csv("OutputData.tsv", sep=',')
# -*- coding:utf-8 -*-
# Logistic regression: automated modelling.
# Selects features of the bank-loan data with randomized logistic regression,
# then fits a plain logistic regression on the selected features.
import pandas as pd
from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR

data = pd.read_excel("c://mldata//bankloan.xls", header=0)

# x = data.iloc[:, :8].as_matrix()
# y = data.iloc[:, 8].as_matrix()
# (Original author's note: that way of reading the data and the one used
# below can both affect the resulting precision.)
train_data = data.values  # convert the loaded data to an array
train_x = train_data[0::, :8]      # first 8 columns are the features
train_label = train_data[0::, 8]   # 9th column is the label

rlr = RLR()  # randomized logistic regression model used to screen variables
rlr.fit(train_x, train_label)  # train the selector

# The support mask has one entry per FEATURE (8), while data.columns also
# contains the label column (9 entries). Index the feature names only, so the
# boolean mask and the index have matching lengths.
feature_columns = data.columns[:8]
selected_columns = feature_columns[rlr.get_support()]  # feature screening result

# Single-argument print(...) produces the same output on Python 2 and 3.
print(u"特征筛选结束")
print(u"有效特征为:%s" % u'、'.join(selected_columns))

x = data[selected_columns].values  # data restricted to the selected features
lr = LR()
lr.fit(x, train_label)  # train the model on the selected features
print(u'逻辑回归训练结束')
print(u'模型的平均正确率为:%s' % lr.score(x, train_label))
# Useful sources:
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RandomizedLogisticRegression.html#sklearn.linear_model.RandomizedLogisticRegression
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html#sklearn.linear_model.LogisticRegressionCV
#
# Demo: select features of the iris data with RandomizedLogisticRegression,
# fit a LogisticRegression on the reduced data, and show transform /
# inverse_transform round trips.
from sklearn.linear_model import RandomizedLogisticRegression, LogisticRegression #, LogisticRegressionCV
from sklearn.datasets import load_iris
import numpy as np

iris = load_iris()
X, y = iris.data, iris.target
print(X)
print(y)

ff_model = RandomizedLogisticRegression()  # Finds best set of features
X_new = ff_model.fit_transform(X, y)  # Fit data and get transformed input rows
print(X_new)
print(X.shape)
print(X_new.shape)

# Work with the first 4 rows of data from here on.
first_rows = X[0:4]
print(first_rows)
first_rows_selected = ff_model.transform(first_rows)  # keep only the best features
print(first_rows_selected)

model = LogisticRegression().fit(X_new, y)  # Fit logistic regression with best features
print(model.predict_proba(first_rows_selected))  # predict probabilities for first 4 rows of data

print(ff_model.inverse_transform(first_rows_selected))  # Test inverse transforming

# Row of ones sized to the number of features actually retained. The original
# hard-coded three columns, which fails whenever the (unseeded) selector keeps
# a different number of features.
arr = np.ones((1, X_new.shape[1]), dtype=int)
print(ff_model.inverse_transform(arr))  # Get original matrix structure with 1's only in columns of retained features.
#-*- coding: utf-8 -*-
# Logistic regression: automated modelling.
import pandas as pd
from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR

# Parameter initialisation
filename = '../data/bankloan.xls'
data = pd.read_excel(filename)

# .values replaces the deprecated .as_matrix() and returns the same array.
feature_columns = data.columns[:8]
x = data.iloc[:,:8].values  # 8 attributes
y = data.iloc[:,8].values   # 9th column: result label

# Stability selection is used to pick the features.
# Randomized logistic regression model for variable screening, with an
# explicit selection threshold of 0.5 (not the library default).
rlr = RLR(selection_threshold=0.5)
rlr.fit(x, y)  # train the selector

# The support mask has one entry per feature (8); data.columns also holds the
# label column, so index the feature names only to keep the lengths equal.
selected_columns = feature_columns[rlr.get_support()]  # feature screening result

print(u'通过随机逻辑回归模型筛选特征结束。')
print(u'有效特征为:%s' % ','.join(selected_columns))

# With the features selected, retrain the model on them.
x = data[selected_columns].values
lr = LR()     # logistic regression model
lr.fit(x, y)  # train on the selected features
print(u'逻辑回归模型训练结束。')
print(u'模型的平均正确率为:%s' % lr.score(x, y))
#-*- coding: utf-8 -*-
# Logistic regression: automated modelling.
import pandas as pd
from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR

# Parameter initialisation
filename = '../data/bankloan.xls'
data = pd.read_excel(filename)

# .values replaces the deprecated .as_matrix() and returns the same array.
feature_columns = data.columns[:8]
x = data.iloc[:,:8].values
y = data.iloc[:,8].values

rlr = RLR()    # randomized logistic regression model used to screen variables
rlr.fit(x, y)  # train the selector

# Feature screening result (the per-feature scores are available through the
# .scores_ attribute). The mask has one entry per feature (8), whereas
# data.columns also contains the label column, so only the feature names are
# indexed with it.
selected_columns = feature_columns[rlr.get_support()]

print(u'通过随机逻辑回归模型筛选特征结束。')
print(u'有效特征为:%s' % ','.join(selected_columns))

x = data[selected_columns].values  # data restricted to the selected features
lr = LR()     # logistic regression model
lr.fit(x, y)  # train on the selected features
print(u'逻辑回归模型训练结束。')
# Report the model's mean accuracy (81.4% for this example, per the original author).
print(u'模型的平均正确率为:%s' % lr.score(x, y))
# Code listing 5-1: logistic regression
import pandas as pd
# Logistic regression model
from sklearn.linear_model import LogisticRegression as LR
# Randomized logistic regression model
from sklearn.linear_model import RandomizedLogisticRegression as RLR

# Parameter initialisation
fileName = 'data/bankloan.xls'
data = pd.read_excel(fileName)

# .values replaces the deprecated .as_matrix() and returns the same array.
feature_columns = data.columns[:8]
x = data.iloc[:,:8].values
y = data.iloc[:,8].values

# Build the randomized logistic regression model used to screen variables
rlr = RLR()
# Train the selector
rlr.fit(x,y)

# Feature screening result (the per-feature scores are available through the
# .scores_ attribute). The mask has one entry per feature (8), whereas
# data.columns also contains the label column, so only the feature names are
# indexed with it.
selected_columns = feature_columns[rlr.get_support()]

print(u'通过随机逻辑回归模型筛选特征结束。')
print(u'有效特征为 %s' %'.'.join(selected_columns))

# Data restricted to the selected features
x = data[selected_columns].values

# Build the logistic regression model
lr = LR()
# Train the model on the selected features
lr.fit(x,y)
print(u'逻辑回归模型训练结束。')
# NOTE: the original author reports a mean accuracy of 81.48 for this example,
# but this script does not print it; lr.score(x, y) returns that value.
# Script: select features of the admission data with randomized logistic
# regression, fit a logistic regression on them and print its accuracy.
import pandas as pda
from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR

fname = "C:/Users/Administrator/Desktop/data/luqu.xls"
dataf = pda.read_excel(fname)

# DataFrame.iloc: purely integer-location based indexing for selection by position.
# .values gives the NumPy-array representation (replaces deprecated as_matrix()).
feature_cols = dataf.columns[1:4]   # columns 1..3 hold the features
x = dataf.iloc[:, 1:4].values
# Column 0 is the label; select it as a 1-D array, which is the shape the
# estimators expect for y (the original passed an (n, 1) column vector).
y = dataf.iloc[:, 0].values

r1 = RLR()
r1.fit(x, y)
eff = r1.get_support()  # find the effective features, remove non-effective ones
#print(feature_cols[eff])

# Apply the mask to the feature columns only. Indexing dataf.columns directly
# (as the original did) is shifted by one because column 0 is the label, so it
# could select the wrong columns - including the label itself.
t = dataf[feature_cols[eff]].values

r2 = LR()
r2.fit(t, y)
print("training ends")
# score(): returns the mean accuracy on the given data and labels. It must be
# given the same selected features the model was trained on (t, not x).
print("accuracy: " + str(r2.score(t, y)))
def evaluate_model(model, X, y, labels, save_features=False, group_ablation=False, feature_output_name="features.csv", ticks=XTICKS):
    """
    Cross-validate `model` over 10 group-wise folds for several feature counts.

    For every fold and every k in `ticks`, the top-k features returned by
    util.get_top_pearson_features (computed on the training split) are used to
    fit `model`, and the F1 score on the test split is recorded.

    Parameters
    ----------
    model : estimator with fit/predict
        Refitted in place for every fold / feature count.
    X : pandas.DataFrame
        Feature matrix (``X.values`` and ``X.columns`` are used).
    y : pandas object exposing ``.values``
        Targets.
    labels : array-like
        Group labels passed to GroupKFold so a group never spans folds.
    save_features : bool
        When True, fit a RandomizedLogisticRegression per fold, average its
        scores over folds and write the ranking to
        "output/feature_scores/<feature_output_name>".
    group_ablation : bool
        When True, additionally refit with each feature group removed and
        print the mean change in ROC AUC per group.
    feature_output_name : str
        File name for the feature-score CSV.
    ticks : iterable of int
        Feature counts (k) to evaluate.

    Returns
    -------
    numpy.ndarray
        F1 scores, one row per fold and one column per entry of `ticks`.
    """
    model_fs = RandomizedLogisticRegression(C=1, random_state=1)

    # Split into folds using labels
    # label_kfold = LabelKFold(labels, n_folds=10)
    group_kfold = GroupKFold(n_splits=10).split(X, y, groups=labels)

    folds = []
    feat_scores = []  # For feature analysis: one score vector per fold

    # Group ablation study bookkeeping
    feature_groups = feature_sets.get_all_groups()
    ablated = {key: set() for key in feature_groups}
    roc_ab = {key: [] for key in feature_groups}
    roc_ab['true_roc_score'] = []

    for train_index, test_index in group_kfold:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("processing fold: %d" % (len(folds) + 1))

        # Split
        X_train, X_test = X.values[train_index], X.values[test_index]
        y_train, y_test = y.values[train_index], y.values[test_index]

        scores = []
        # Honour the `ticks` argument; the original looped over the global
        # XTICKS and silently ignored what the caller passed in.
        for k in ticks:
            indices = util.get_top_pearson_features(X_train, y_train, k)

            # Select k features
            X_train_fs = X_train[:, indices]
            X_test_fs = X_test[:, indices]

            model = model.fit(X_train_fs, y_train)
            yhat = model.predict(X_test_fs)             # Predict
            scores.append(f1_score(y_test, yhat))       # Save

            # NOTE(review): the ablation is run for every k; confirm this
            # nesting against the intended experiment design.
            if group_ablation:
                true_roc_score = roc_auc_score(y_test, yhat)
                roc_ab['true_roc_score'].append(true_roc_score)

                for group in feature_groups:
                    # Column indices belonging to this feature group
                    features = feature_groups[group]
                    features_idx = set(util.get_column_index(features, X))

                    # Split the selected indices into kept / removed
                    indices_ab = [i for i in indices if i not in features_idx]
                    removed_indices = [i for i in indices if i in features_idx]

                    # Filter
                    X_train_ab = X_train[:, indices_ab]
                    X_test_ab = X_test[:, indices_ab]

                    # Fit (this refits the same `model` object)
                    model_ab = model.fit(X_train_ab, y_train)
                    # Predict
                    yhat_ab = model_ab.predict(X_test_ab)

                    # Save the removed feature names and the AUC change
                    ablated[group].update(X.columns[removed_indices])
                    roc_ab_score = roc_auc_score(y_test, yhat_ab)
                    roc_ab[group].append(roc_ab_score - true_roc_score)

        # ----- save row -----
        folds.append(scores)

        # ----- save features -----
        if save_features:
            model_fs = model_fs.fit(X_train, y_train)
            feat_scores.append(model_fs.scores_)

    if save_features:
        # Average the per-fold scores. The original computed this mean and
        # then ranked by the last fold's scores only, discarding the average.
        mean_scores = np.asarray(feat_scores).mean(axis=0)

        # Map scores to features and sort by score, feature name first.
        ranked = sorted(zip(X.columns, [round(s, 4) for s in mean_scores]),
                        reverse=True, key=lambda pair: pair[1])
        feat_scores = pd.DataFrame(ranked)

        csv_path = "output/feature_scores/" + feature_output_name
        feat_scores.to_csv(csv_path, index=False)
        util.print_full(feat_scores)

    if group_ablation:
        roc_ab = pd.DataFrame(roc_ab).mean()
        print("=======================")
        print("True AUC Score: %f" % roc_ab['true_roc_score'])
        print("=======================\n\n")

        for group in ablated:
            print("-----------------------")
            print("Group: %s " % group)
            print("Removed: %s" % list(ablated[group]))
            print("Change in AUC: %f" % (roc_ab[group]))
            print("-----------------------\n")

    folds = np.asarray(folds)
    return folds
# Script: select features with RandomizedLogisticRegression, then estimate the
# accuracy of a linear SVC on the selected features with stratified k-fold CV.
import numpy as np  # np.sum / np.size are used below but numpy was never imported here
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVC
from sklearn.cross_validation import StratifiedKFold
from sklearn.linear_model import RandomizedLogisticRegression

import fmriUtils as fm  # project-specific helper functions

n_folds = 10
f = fm.outTo()  # redirect output to a file (per the original author's note)

X,y = fm.loadData2()
X2,y2 = fm.loadData2()
y = fm.defineClass(y)

randomized_logistic = RandomizedLogisticRegression(C=0.1,n_jobs=2)
randomized_logistic.fit(X,y)
XX = randomized_logistic.transform(X)

# Single-argument print(...) produces the same output on Python 2 and 3.
# Header text: "features remaining after selection".
print("============选择后剩余的特征================")
print(XX.shape)

yy = y
cv = StratifiedKFold(yy,n_folds)
cv_scores = []
for train, test in cv:
    svc = SVC(kernel='linear')
    svc.fit(XX[train], yy[train])
    prediction = svc.predict(XX[test])
    # Fraction of correctly classified test samples in this fold.
    fold_accuracy = np.sum(prediction == yy[test]) / float(np.size(yy[test]))
    cv_scores.append(fold_accuracy)

# Header text: "classification accuracy". NOTE: cv_scores itself is not
# printed in this part of the script.
print("========分类准确率=======")