def get_support_fields(X, Y):
    """Select informative columns of DataFrame X via randomized logistic regression.

    Fits a RandomizedLogisticRegression (stability selection) on (X, Y),
    prints the per-feature scores and the selected column names, and returns
    the selected columns as a NumPy matrix.

    BUG FIXES vs. original: Python-2 ``print`` statement, ``str.decode`` on a
    py3 str, deprecated ``.as_matrix()``, and selection taken from an
    undefined global ``data`` instead of the ``X`` argument.
    """
    rlr = RLR()  # randomized logistic regression model for feature screening
    rlr.fit(X, Y)
    support = rlr.get_support()  # boolean mask; scores available via rlr.scores_
    print(rlr.scores_)
    print(u'有效特征为:%s' % ','.join(X.columns[support]))
    # .values replaces .as_matrix(), which was removed in pandas 1.0
    return X[X.columns[support]].values
def randomized_Logistic_regression(self):
    """Stability-select features of self.data and return the kept indices.

    Column 0 of self.data is treated as the label; the remaining columns
    are the candidate features.

    Returns the tuple produced by ``np.where`` over the support mask
    (indices of the selected feature columns).
    """
    X = self.data[:, 1:len(self.data[0])]
    y = self.data[:, 0]
    randomized_logistic = RandomizedLogisticRegression()
    randomized_logistic.fit(X, y)
    mask = randomized_logistic.get_support()
    selected = np.where(mask)
    # BUG FIX: the computed result was silently discarded; return it so
    # callers can actually use the selected feature indices. (Callers that
    # ignored the previous implicit None are unaffected.)
    return selected
def randomized_Logistic_regression(self):
    """Fit a RandomizedLogisticRegression on self.data (column 0 is the
    label, the rest are features) and compute the indices of the features
    it keeps."""
    labels = self.data[:, 0]
    features = self.data[:, 1:len(self.data[0])]
    model = RandomizedLogisticRegression()
    model.fit(features, labels)
    support_mask = model.get_support()
    selected = np.where(support_mask)
def get_features(X_train, y_train, names, selection_threshold=0.2):
    """Stability-select ngram features.

    Runs RandomizedLogisticRegression with the given selection threshold
    on the training data, prints the surviving ngrams, and returns them
    as a NumPy array of names.
    """
    print('\ngetting features with randomized logistic regression...')
    print('using a selection threshold of {}'.format(selection_threshold))
    selector = RandomizedLogisticRegression(
        selection_threshold=selection_threshold)
    selector.fit(X_train, y_train)
    keep = selector.get_support()
    features = np.array(names)[keep]
    print('found {} ngrams:'.format(len([f for f in features])))
    print([f for f in features])
    return features
def getElgiibleFeatures(allFeatureParam, allLabelParam):
    '''Return integer indices of features kept by randomized L1 selection.

    reff for paper :
    http://scikit-learn.org/stable/modules/feature_selection.html#randomized-l1
    http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RandomizedLogisticRegression.html
    '''
    selector = RandomizedLogisticRegression()
    selector.fit(allFeatureParam, allLabelParam)
    # per-feature scores would be available via selector.scores_
    # indices=True yields integer positions rather than a boolean mask
    return selector.get_support(indices=True)
def logistic(X_train, X_test, y_train, y_test):
    """Stability-select features, fit LogisticRegression, report metrics.

    Returns (pred_prob, pred_prob_train): class probabilities on the test
    and training sets respectively. Calls the module-level
    ``metrics_result`` helper for evaluation output.
    """
    from sklearn.linear_model import LogisticRegression as LR
    from sklearn.linear_model import RandomizedLogisticRegression as RLR
    # feature engineering: stability selection of predictive columns
    rlr = RLR()
    rlr.fit(X_train, y_train)
    print(rlr.get_support())
    # .values replaces .as_matrix(), which was removed in pandas 1.0
    x = X_train[X_train.columns[rlr.get_support()]].values
    x_test = X_test[X_test.columns[rlr.get_support()]].values
    '''
    x=X_train
    x_test=X_test
    '''
    # logistic regression on the reduced feature matrices
    lr = LR()
    lr.fit(x, y_train)
    pred_prob_train = lr.predict_proba(x)
    pred_prob = lr.predict_proba(x_test)
    print('logistic')
    predicts = lr.predict(x_test)
    metrics_result(y_test, predicts)
    return pred_prob, pred_prob_train
def logistic_regression():
    """Bank-loan default demo: screen features, then fit LogisticRegression.

    Reads bankloan.xls (first 8 columns: predictors, 9th: default label),
    runs RandomizedLogisticRegression for feature screening, then trains
    and scores a plain LogisticRegression on all 8 predictors.

    BUG FIXES vs. original: Python-2 ``print`` statements (syntax errors
    under py3) and ``.as_matrix()`` (removed in pandas 1.0).
    """
    filename = SRC_PATH + '/data/bankloan.xls'
    data = pd.read_excel(filename)
    print(data.head())
    print(data.tail())
    x = data.iloc[:, :8].values
    y = data.iloc[:, 8].values
    print(x, y)
    rlr = RLR()  # randomized logistic regression for variable screening
    rlr.fit(x, y)
    rlr.get_support()  # mask of kept features; scores via rlr.scores_
    print(u'通过随机逻辑回归模型筛选特征结束。')
    # Feature subsetting left disabled, exactly as in the original:
    # print(u'有效特征为:%s' % ','.join(data.columns[rlr.get_support()]))
    # x = data[data.columns[rlr.get_support()]].values
    lr = LR()
    lr.fit(x, y)
    print(u'逻辑回归模型训练结束。')
    print(u'模型的平均正确率为:%s' % lr.score(x, y))
def programmer_1():
    """Bank-loan default demo: stability-select features, then fit LR.

    Columns 0-7 of bankloan.xls are predictors and column 8 ('违约') is
    the label; the support mask is mapped back to column names by
    dropping the label column first.
    """
    filename = "data/bankloan.xls"
    data = pd.read_excel(filename)
    # .values replaces .as_matrix(), which was removed in pandas 1.0
    x = data.iloc[:, :8].values
    y = data.iloc[:, 8].values

    rlr = RLR()  # stability selection of predictive columns
    rlr.fit(x, y)
    rlr_support = rlr.get_support()
    support_col = data.drop('违约', axis=1).columns[rlr_support]
    print(
        "rlr_support_columns: {columns}".format(columns=','.join(support_col)))

    x = data[support_col].values  # retrain on the kept columns only
    lr = LR()
    lr.fit(x, y)
    print("lr: {score}".format(score=lr.score(x, y)))
def programmer_1():
    """Bank-loan demo: stability-select predictors, then fit a plain LR.

    Columns 0-7 of bankloan.xls are predictors, column 8 ('违约') the
    label; selected column names are recovered by dropping the label
    column before applying the support mask.
    """
    frame = pd.read_excel("data/bankloan.xls")
    features = frame.iloc[:, :8].as_matrix()
    labels = frame.iloc[:, 8].as_matrix()

    selector = RLR()
    selector.fit(features, labels)
    keep_mask = selector.get_support()
    support_col = frame.drop('违约', axis=1).columns[keep_mask]
    print(
        "rlr_support_columns: {columns}".format(columns=','.join(support_col)))

    reduced = frame[support_col].as_matrix()
    model = LR()
    model.fit(reduced, labels)
    print("lr: {score}".format(score=model.score(reduced, labels)))
def tipdm_chapter5_test():
    """Chapter-5 demo: feature screening + logistic regression on bankloan data.

    Reads bankloan.xls (columns 0-7: predictors, 8: label), keeps the
    features selected by RandomizedLogisticRegression, then trains and
    scores a LogisticRegression on the reduced matrix.

    FIX vs. original: ``.as_matrix()`` was removed in pandas 1.0; use
    ``.values`` instead.
    """
    # parameter initialisation
    filename = '../../../MyFile/chapter5/data/bankloan.xls'
    data = pd.read_excel(filename)
    x = data.iloc[:, :8].values
    y = data.iloc[:, 8].values

    # feature selection via stability selection
    rlr = RLR()
    rlr.fit(x, y)
    features = rlr.get_support()  # boolean mask; scores via rlr.scores_
    print(u'通过随机逻辑回归模型筛选特征结束。')
    print(u'有效特征为: {0}'.format(','.join(data.columns[features])))
    x = data[data.columns[features]].values  # keep only selected columns

    # training and evaluation on the same data (mean accuracy)
    lr = LR()
    lr.fit(x, y)
    print(u'逻辑回归模型训练结束。')
    print(u'模型的平均正确率为: {0}'.format(lr.score(x, y)))
def programmer_1():
    """Bank-loan demo with explicit label-column masking.

    The support mask from RandomizedLogisticRegression covers only the 8
    predictor columns, so a trailing ``False`` is appended to align it
    with ``data.columns`` (which also contains the label column).

    FIX vs. original: ``.as_matrix()`` was removed in pandas 1.0; use
    ``.values`` instead.
    """
    # parameter initialisation
    filename = r'bankloan.xls'
    data = pd.read_excel(filename)
    x = data.iloc[:, :8].values  # pandas handles the header row for us
    y = data.iloc[:, 8].values

    rlr = RLR()  # randomized logistic regression for feature screening
    rlr.fit(x, y)
    egeList = rlr.get_support()  # mask over the 8 predictor columns
    # pad the mask with False for the label column so it matches data.columns
    egeList = np.append(egeList, False)
    print("rlr.get_support():")
    print(egeList)
    print(u'随机逻辑回归模型特征选择结束!!!')
    print(u'有效特征为:%s' % ','.join(data.columns[egeList]))
    x = data[data.columns[egeList]].values  # keep only selected predictors

    lr = LR()  # plain logistic regression on the reduced matrix
    lr.fit(x, y)
    print(u'逻辑回归训练模型结束!!!')
    print(u'模型的平均正确率:%s' % lr.score(x, y))
def pick_variables(self,
                   descover=True,
                   method="rlr",
                   threshold=0.25,
                   auto_pick=True):
    """Feature-selection helper (default selection threshold 0.25).

    With method="rlr", performs stability selection via
    RandomizedLogisticRegression: the selector is re-run on resampled
    data/feature subsets and the results are aggregated, so robustly
    useful features score near 1.0 and useless ones near 0.0.

    Side effects: self.X_train and self.X_test are reduced in place to
    the selected columns; when auto_pick is true, self.picked_data is set
    to self.data restricted to those columns plus the "y" label column.

    Returns a one-column DataFrame ('var_score') of per-variable
    stability scores, indexed by the original X_train column names.

    NOTE(review): ``descover`` is not used in this branch — presumably
    reserved for other methods; confirm against callers. No value is
    returned when method != "rlr".
    """
    if method == "rlr":
        # Stability selection: fit the randomized selector; features whose
        # aggregated score clears `threshold` are kept.
        rlr = RandomizedLogisticRegression(
            selection_threshold=threshold)
        rlr.fit(self.X_train, self.y_train)
        # aggregated per-feature stability scores
        scoretable = pd.DataFrame(rlr.all_scores_,
                                  index=self.X_train.columns,
                                  columns=['var_score'])
        # get_support() returns the boolean mask of selected features
        columns_need = list(self.X_train.columns[rlr.get_support()])
        self.X_train = self.X_train[columns_need]
        self.X_test = self.X_test[columns_need]
        columns_need.append("y")  # include the label for picked_data below
        if auto_pick:
            self.picked_data = self.data[columns_need]
        return scoretable
def data_proc(self):
    """Screen features of self.data, then fit a class-weighted LogisticRegression.

    Loads the data, stability-selects predictors from the first 8 columns
    (column 8 is the label), trains a weighted LR on the kept columns,
    and prints its training accuracy plus one example prediction.

    BUG FIX vs. original: the selected columns were indexed off an
    undefined global ``data`` (NameError) instead of ``self.data``.
    Also replaces ``.as_matrix()``, removed in pandas 1.0, with ``.values``.
    """
    self.load_data()
    # iloc is purely positional: first 8 columns are features, 9th the label
    x = self.data.iloc[:, :8].values
    y = self.data.iloc[:, 8].values

    # stability selection of informative features
    rlr = RLR()
    rlr.fit(x, y)
    support = rlr.get_support()  # per-feature scores available via rlr.scores_
    print("有效特征为%s" % ','.join(self.data.columns[support]))
    x = self.data[self.data.columns[support]].values  # reduced feature matrix

    # class_weight biases the loss when misclassification costs differ;
    # class_weight='balanced' would infer weights automatically
    lr = LR(class_weight={
        0: 0.9,
        1: 0.1
    })
    # sample_weight gives each row its own importance
    # NOTE(review): must have exactly one weight per row — confirm the
    # dataset really has 5 rows.
    # lr.fit(x, y, sample_weight=[1, 2, 3, 5, 4, 9, 8, 10])
    lr.fit(x, y, sample_weight=[1, 2, 3, 5, 4])
    # NOTE(review): this probe row has 8 features, but x may have fewer
    # after selection — confirm it matches the reduced column set.
    result = lr.predict([[24, 2, 2, 0, 28, 17.3, 1.79, 3.06]])
    print('模型的正确率是:%s,预测结果是 %d' % (lr.score(x, y), result))
# Admission model: screen features with randomized logistic regression,
# then fit a LogisticRegression on the kept columns.
import pandas as pda

fname = "C:/Users/Administrator/Desktop/data/luqu.xls"
dataf = pda.read_excel(fname)
# DataFrame.iloc: purely positional indexing — column 0 is the label,
# columns 1-3 the predictors.
# .values replaces .as_matrix(), which was removed in pandas 1.0.
x = dataf.iloc[:, 1:4].values
y = dataf.iloc[:, 0:1].values

from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR

r1 = RLR()
r1.fit(x, y)
eff = r1.get_support()  # mask of effective features; noise columns dropped
# print(dataf.columns[eff])
t = dataf[dataf.columns[eff]].values  # matrix of the selected columns

r2 = LR()
r2.fit(t, y)
print("training ends")
# BUG FIX: the model was trained on the reduced matrix t, so it must be
# scored on t as well — scoring on x raises a feature-count mismatch.
# score(): returns the mean accuracy on the given data and labels.
print("accuracy: " + str(r2.score(t, y)))
#-*- coding: utf-8 -*-
# Logistic regression, automatic modelling: stability selection with a
# raised threshold (0.5), then a plain logistic regression fit.
import pandas as pd
from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR

# parameter initialisation
filename = '../data/bankloan.xls'
data = pd.read_excel(filename)
# .values replaces .as_matrix(), which was removed in pandas 1.0
x = data.iloc[:, :8].values  # eight predictor columns
y = data.iloc[:, 8].values   # ninth column: outcome label

# Stability selection: features scoring below 0.5 are dropped
# (the library default threshold is 0.25).
rlr = RLR(selection_threshold=0.5)
rlr.fit(x, y)
rlr.get_support()  # boolean mask of kept features
print(u'通过随机逻辑回归模型筛选特征结束。')
print(u'有效特征为:%s' % ','.join(data.columns[rlr.get_support()]))
x = data[data.columns[rlr.get_support()]].values  # retrain on kept columns

lr = LR()
lr.fit(x, y)
print(u'逻辑回归模型训练结束。')
print(u'模型的平均正确率为:%s' % lr.score(x, y))
# -*- coding:utf-8 -*-
# Peishichao
# Bank-loan default demo: stability-select predictors with randomized
# logistic regression, then fit a plain logistic regression on them.
import pandas as pd

frame = pd.read_excel('../data/bankloan.xls')
features = frame.iloc[:, :8].as_matrix()  # first eight columns: predictors
labels = frame.iloc[:, 8].as_matrix()     # ninth column: default label

from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR

selector = RLR()
selector.fit(features, labels)
selector.get_support()
print(selector.get_support())
print('end')
selected = frame[frame.columns[selector.get_support()]].as_matrix()
print(selected)

classifier = LR()
classifier.fit(selected, labels)
print('end')
print('accur: %s' % classifier.score(selected, labels))
def runTest(featmat_train, outcome_train_lbl, featmat_test, outcome_test_lbl, sel, paramsDict, bestmodelnum):
    """Train and evaluate one leave-one-person-out model for the test subject.

    Per feature group (fg): scale, variance-threshold, and stability-select
    columns; then concatenate the per-group selections, run a final
    selection pass, fit the configured classifier, predict for the held-out
    person, and log one CSV row of results.

    Returns (Y_pred, Y_pred_proba) for the single test sample.

    NOTE(review): relies on module-level globals (TEST_PERSON_NUM,
    TEST_PERSON_DEVICE_ID, suffix_list, fgColIdxs, fgIdxs, modelname,
    folderpath, STARTTIME, getValueCounts) — confirm against the enclosing
    module. Uses dict.iteritems(), i.e. Python 2.
    """
    print("Running Test for #{0} ({1})".format(TEST_PERSON_NUM, TEST_PERSON_DEVICE_ID))
    X_train_allfg = featmat_train.values
    Y_train = outcome_train_lbl.values
    # Y_train = Y_train.reshape(Y_train.size, 1)# does this help?
    featnames_allfg = featmat_train.columns
    X_test_allfg = featmat_test.values
    Y_test = outcome_test_lbl.values
    Y_true = Y_test[0]  # single held-out sample: its true label
    sel_featnames_per_fg = {}
    sel_featnames_list_ordered = []
    sel_X_train = []
    sel_X_test = []
    countNumSel = 0
    fgi = 0
    # One pass per feature-group suffix combination.
    for s in suffix_list:
        fgi = fgi + 1
        # print fgi,
        suffix_list_str = ",".join(s)
        fgidxs = fgColIdxs[suffix_list_str]  # column indices of this feature group
        X_train = X_train_allfg[:, fgidxs]
        X_test = X_test_allfg[:, fgidxs]
        featnames_fg = featnames_allfg[fgidxs]
        # continue if empty
        if X_train.shape[1] == 0:
            continue
        ## scaling (fit on train only, apply to both)
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        # variance thresholding: drop constant columns, keep names aligned
        vartransform = VarianceThreshold()
        X_train = vartransform.fit_transform(X_train)
        X_test = vartransform.transform(X_test)
        varthres_support = vartransform.get_support()
        featnames_fg = featnames_fg[varthres_support]
        ## feature selection
        if sel == "rlog":
            #print (X_train.shape)
            randomized_rlog = RandomizedLogisticRegression(**paramsDict)
            X_train = randomized_rlog.fit_transform(X_train, Y_train)
            X_test = randomized_rlog.transform(X_test)
            chosen_col_idxs = randomized_rlog.get_support()
            #print (len(featnames_fg))
            #print (len(chosen_col_idxs))
            # NOTE(review): chosen_col_idxs is a boolean mask, so len() is the
            # total feature count, not the number selected — confirm intent.
            if len(chosen_col_idxs) > 0:
                featnames_fg_chosen = list(featnames_fg[chosen_col_idxs])
                sel_featnames_per_fg[suffix_list_str] = featnames_fg_chosen
                sel_featnames_list_ordered = sel_featnames_list_ordered + featnames_fg_chosen
                sel_X_train.append(X_train)
                sel_X_test.append(X_test)
                countNumSel = countNumSel + len(featnames_fg_chosen)
        else:
            # NOTE(review): raising a plain str is a TypeError in Python 3;
            # should presumably be e.g. ValueError(...).
            raise ("Unrecognized sel (feature selection algorithm)")
    ## feature selection: sel{sel{fg1}.....sel{fg45}}
    X_train_concat = np.hstack(sel_X_train)
    X_test_concat = np.hstack(sel_X_test)
    print("\nSum of number of features selected from all fgs = {0}".format(
        countNumSel))
    print("Concatenated X_train has {0} features".format(
        X_train_concat.shape[1]))
    print("Concatenated X_test has {0} features".format(
        X_test_concat.shape[1]))
    # Second selection pass over the concatenation of all per-group picks.
    if sel == "rlog":
        randomized_rlog = RandomizedLogisticRegression(**paramsDict)
        X_train_concat = randomized_rlog.fit_transform(X_train_concat, Y_train)
        X_test_concat = randomized_rlog.transform(X_test_concat)
        chosen_col_idxs = randomized_rlog.get_support()
        sel_featnames_list_ordered = np.array(sel_featnames_list_ordered)
        chosen_col_idxs = np.array(chosen_col_idxs)
        chosen_cols_final = sel_featnames_list_ordered[chosen_col_idxs]
    else:
        raise ("Unrecognized sel (feature selection algorithm)")
    print("Final number of features in model = {0}".format(
        X_train_concat.shape[1]))
    # GBCT — classifier choice driven by the module-level `modelname`
    if modelname == "GBC":
        clf = GradientBoostingClassifier(random_state=0)
    elif modelname == "LOGR":
        clf = LogisticRegression(random_state=0,
                                 C=paramsDict["C"],
                                 tol=1e-3,
                                 penalty="l1",
                                 n_jobs=paramsDict["n_jobs"],
                                 intercept_scaling=1,
                                 class_weight="balanced")
    else:
        raise ("Unrecognized model name")
    clf.fit(X_train_concat, Y_train)
    pred = clf.predict(X_test_concat)
    pred_proba = clf.predict_proba(X_test_concat)
    Y_pred = pred[0]
    Y_pred_proba = pred_proba[0][1]  # probability of the positive class
    ## Logging test_person_test.csv - outputs 1 line only
    ## did, sel, selParams, Y_pred, Y_pred_proba, Y_true, chosen_cols_final, suffix_list_str : sel_featnames_per_fg[suffix_list_str] in separate columns
    chosen_cols_final_str = ",".join(chosen_cols_final)
    paramsDict_str = ','.join("%s:%r" % (key, val)
                              for (key, val) in paramsDict.iteritems())
    fgIdxs_str = ','.join("%s:%r" % (key, val)
                          for (key, val) in fgIdxs.iteritems())
    cnts_per_lbl_dict = getValueCounts(outcome_train_lbl, outcome_test_lbl)
    cnts_per_lbl_str = ','.join("%s:%r" % (key, val)
                                for (key, val) in cnts_per_lbl_dict.iteritems())
    dfout = pd.DataFrame({
        "did": [TEST_PERSON_DEVICE_ID],
        "cnts_per_lbl": [cnts_per_lbl_str],
        "sel": [sel],
        "selParams": [paramsDict_str],
        "Y_pred": [Y_pred],
        "Y_pred_proba": [Y_pred_proba],
        "Y_true": [Y_true],
        "fgIdxs": [fgIdxs_str],
        "sel_final": [chosen_cols_final_str]
    })
    dfout = dfout.set_index("did")
    cols = [
        "cnts_per_lbl", "sel", "selParams", "Y_pred", "Y_pred_proba",
        "Y_true", "fgIdxs", "sel_final"
    ]
    # One extra column per feature group listing the features it contributed.
    for s in suffix_list:
        suffix_list_str = ",".join(s)
        if suffix_list_str in sel_featnames_per_fg:
            sel_feats_fg_str = ",".join(sel_featnames_per_fg[suffix_list_str])
        else:
            sel_feats_fg_str = ""
        dfcol = pd.DataFrame({
            "did": [TEST_PERSON_DEVICE_ID],
            "sel_{0}".format(suffix_list_str): [sel_feats_fg_str]
        })
        dfcol = dfcol.set_index("did")
        dfout = pd.concat([dfout, dfcol], axis=1)
        cols.append("sel_{0}".format(suffix_list_str))
    dfout.to_csv(
        folderpath + "{0}_test_model{1}.csv".format(TEST_PERSON_DEVICE_ID,
                                                    bestmodelnum),
        columns=cols,
        header=True)
    print("{0} minutes elapsed since start of program ".format(
        (time.time() - STARTTIME) / 60.0))
    return (Y_pred, Y_pred_proba)
#-*- coding: utf-8 -*-
# Logistic regression, automatic modelling (bankloan demo): stability
# selection of predictors followed by a plain logistic regression fit.
import pandas as pd

# parameter initialisation
bank = pd.read_excel('../data/bankloan.xls')
predictors = bank.iloc[:, :8].as_matrix()  # columns 0-7: predictors
target = bank.iloc[:, 8].as_matrix()       # column 8: default label

from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR

screen = RLR()  # randomized logistic regression for variable screening
screen.fit(predictors, target)
screen.get_support()  # boolean mask; per-feature scores via .scores_
print(u'通过随机逻辑回归模型筛选特征结束。')
print(u'有效特征为:%s' % ','.join(bank.columns[screen.get_support()]))
predictors = bank[bank.columns[screen.get_support()]].as_matrix()

model = LR()
model.fit(predictors, target)
print(u'逻辑回归模型训练结束。')
print(u'模型的平均正确率为:%s' % model.score(predictors, target))
Index(['年龄', '教育', '工龄', '地址', '收入', '负债率', '信用卡负债', '其他负债', '违约'], dtype='object') ''' features = b_data.iloc[:,:8] #print(type(features)) #<class 'pandas.core.frame.DataFrame'> features = features.as_matrix() #从pandas数据框转到numpy的ndarray #print(type(features)) #<class 'numpy.ndarray'> labels = b_data.iloc[:,8].as_matrix() randomized_logistic = RandomizedLogisticRegression() #随机logistic回归模型,用于筛选变量 randomized_logistic.fit(features,labels) #训练随机logistic回归模型 print(randomized_logistic.scores_) #获取各个特征的分数 ''' [ 0.105 0.085 1. 0.425 0. 1. 0.545 0.03 ] ''' print(randomized_logistic.get_support()) #随机logistic回归模型的筛选结果 ''' [False False True True False True True False] ''' #随机logistic回归模型属于稳定性选择中的一种 print('(稳定性选择)有效特征:%s'%','.join(b_data.columns[:-1][randomized_logistic.get_support()])) ''' (稳定性选择)有效特征:工龄,地址,负债率,信用卡负债 ''' feat_1 = b_data[b_data.columns[:-1][randomized_logistic.get_support()]].as_matrix() estimator = SVR(kernel="linear") RFE_selector = RFE(estimator=estimator, n_features_to_select=None, step=1) RFE_selector.fit(features,labels) print(RFE_selector.support_) '''
# -*- coding:utf-8 -*-
# Bank-loan default demo: randomized-logistic-regression feature screening
# followed by a plain logistic regression fit. (Original comments were
# partially mojibake-corrupted; rewritten here.)
import pandas as pd

filename = '../data/bankloan.xls'
data = pd.read_excel(filename)
# .values replaces .as_matrix(), which was removed in pandas 1.0
x = data.iloc[:, :8].values  # columns 0-7: predictors
y = data.iloc[:, 8].values   # column 8: default label

from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR

rlr = RLR()  # randomized logistic regression for variable screening
rlr.fit(x, y)
rlr.get_support()  # boolean mask of the kept features
print(u'有效特征为:%s' % ','.join(data.columns[rlr.get_support()]))
x = data[data.columns[rlr.get_support()]].values  # keep selected columns

lr = LR()  # logistic regression on the reduced matrix
lr.fit(x, y)
print(u'模型的平均正确率:%s' % lr.score(x, y))
# Apply the already-fitted FWE F-test filter and keep names aligned.
X = Fwe.transform(X)
featureNames = featureNames[Fwe.get_support()]
print("F-test filter ->", X.shape)

# Exactly one of these selection strategies runs below.
FeatSelection_SVM = True
FeatSelection_RandLogReg = False

if FeatSelection_RandLogReg == True:
    LogRegFeats = RandomizedLogisticRegression(C=5,
                                               scaling=0.5,
                                               sample_fraction=0.8,
                                               n_resampling=60,
                                               selection_threshold=0.2,
                                               n_jobs=-1)
    X = LogRegFeats.fit_transform(X, y)
    featureNames = featureNames[LogRegFeats.get_support()]
    print("RandomizedLogisticRegression Feature Selection ->:", X.shape)
elif FeatSelection_SVM == True:
    # BUG FIX: this branch previously indexed featureNames with
    # LogRegFeats.get_support(), but LogRegFeats is never fitted on this
    # path. Keep the fitted SVC so its nonzero L1 coefficients provide
    # the kept-feature mask instead.
    svc_selector = LinearSVC(C=1, penalty="l1", dual=False,
                             class_weight='auto').fit(X, y)
    X = svc_selector.transform(X)
    featureNames = featureNames[(svc_selector.coef_ != 0).any(axis=0)]
    # X= LogisticRegression(C=0.01,class_weight='auto').fit_transform(X, y)
    print("SVC Transformed X:", X.shape)
'''
print("Plot #Feats vs Classification performance:")
PlotPerfPercentFeatures(X_LR,y,est=SVC(C=100))
'''
KFilt = None
# KFilt=200
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 24 22:27:16 2018
@author: fan
第一题:
data1 是40名癌症病人的一些生存资料,其中,X1表示生活行动能力评分(1~100),X2表示病人的年龄,X3表示由诊断到直入研究时间(月);X4表示肿瘤类型,X5把ISO两种疗法("1"是常规,"0"是试验新疗法);Y表示病人生存时间("0"表示生存时间小于200天,"1"表示生存时间大于或等于200天)
试建立Y关于X1~X5的logistic回归模型
"""
# Problem 1 (summary): model survival label Y (>=200 days) from five
# patient covariates X1-X5 using logistic regression, with a randomized
# logistic regression screening pass first.
from numpy import *
import pandas as pd

data = pd.read_table('data1.txt', encoding='gbk')
# .values replaces .as_matrix(), which was removed in pandas 1.0
x = data.iloc[:, 1:6].values  # columns 1-5: covariates X1-X5
y = data.iloc[:, 6].values    # column 6: survival label Y

from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR

rlr = RLR()  # randomized logistic regression for feature screening
rlr.fit(x, y)
rlr.get_support()
# NOTE(review): the support mask covers only the 5 fitted covariates while
# data.columns includes the id/label columns — confirm the alignment.
fit_x = data[data.columns[rlr.get_support()]].values  # selected features

lr = LR()
lr.fit(fit_x, y)
# mean training accuracy (about 75% on this data per the original note)
print(lr.score(fit_x, y))
'''
第二题:
data2 是关于重伤病人的一些基本资料。自变量X是病人的住院天数,因变量Y是病人出院后长期恢复
的预后指数,指数数值越大表示预后结局越好。
'''
# Repeat stability selection 100 times at the current threshold `st` and
# record CV performance of the reduced feature set each time.
# NOTE(review): `st`, `lr_mean`, `X`, `y` and `rlr_grid_search` come from
# an enclosing scope not visible here — confirm against the full script.
for i in range(100):
    print("Working on: %s (%d of 100)" % (st, (i + 1)))
    rlr = RandomizedLogisticRegression(
        n_resampling=5000, C=lr_mean.C, selection_threshold=st, n_jobs=2)
    rlr.fit(X, y)
    X_rlr = rlr.transform(X)  # data reduced to the selected features
    # skip scoring when the selection kept no features at all
    if X_rlr.size:
        cv_scores_rlr = cross_val_score(
            lr_mean, X_rlr, y, scoring="roc_auc", cv=StratifiedKFold(9))
        rlr_tmp = {
            "st": st,
            "cv_score": cv_scores_rlr.mean(),
            "cv_std": cv_scores_rlr.std(),
            "n_features": sum(rlr.get_support())  # count of kept features
        }
        rlr_grid_search = rlr_grid_search.append(
            rlr_tmp, ignore_index=True)
# Aggregate the 100 repetitions per threshold value and persist the table.
rlr_grid_search_mean = rlr_grid_search.groupby(by="st").mean()
rlr_grid_search_mean["n_feat_std"] =\
    rlr_grid_search.groupby(by="st").std()["n_features"]
rlr_grid_search_mean["cv_score_std"] = rlr_grid_search.groupby(
    by="st").std()["cv_score"]
rlr_grid_search_mean.to_csv("ispc_grid_search_mean.csv", index=False)
# Final selection run at the chosen threshold (0.75).
rlr = RandomizedLogisticRegression(
    n_resampling=5000, C=lr_mean.C, selection_threshold=0.75)
rlr.fit(X, y)
#!/usr/bin/env python
# _*_ UTF-8 _*_
# Admission prediction: screen features with randomized logistic
# regression, then train a logistic regression on the kept columns.
import pandas as pda

fname = "F:/python_workspace/file/logic/luqu.csv"
dataf = pda.read_csv(fname)
# [rows, cols]: column 0 is the label, columns 1-3 the predictors.
# .values replaces .as_matrix(), which was removed in pandas 1.0.
x = dataf.iloc[:, 1:4].values
y = dataf.iloc[:, 0:1].values

from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR

# Randomized logistic regression model for feature screening.
r1 = RLR()
r1.fit(x, y)
r1.get_support()  # mask of the effective features
# print(dataf.columns[r1.get_support()])
# Matrix of the selected columns, used to fit and predict y.
t = dataf[dataf.columns[r1.get_support()]].values

r2 = LR()
r2.fit(t, y)
print("训练结束")
# BUG FIX: the model was trained on the reduced matrix t, so score on t;
# scoring on x raises a feature-count mismatch.
print("模型正确率为:" + str(r2.score(t, y)))
def GetAllPerf (filePaths=None):
    """Evaluate classifiers on every training-set feature CSV found.

    For each file: load features/labels, optionally filter features
    (SelectKBest, SelectFwe, and several disabled selection strategies),
    compute dummy-classifier baselines, grid-search the best models for
    accuracy and F1, cross-validate them, and accumulate everything into
    a results DataFrame written to OutputData.tsv.

    NOTE(review): depends on module-level helpers/globals (find_files,
    fileNameFromPaths, load_data, Get_yPred, ModelParam_GridSearch,
    metrics, DummyClassifier, cross_val_score, StratifiedShuffleSplit,
    SelectKBest, SelectFwe, np, pd, os, Counter) — confirm against the
    enclosing module. Several statements are bare string literals used as
    in-source notes; they are no-ops and are preserved as-is.
    """
    if filePaths is None:
        filePaths = list(find_files(directory='./test_seq', pattern='trainingSetFeatures.csv'))
    #Sanity check:
    # filePaths=['/a/fr-05/vol/protein/danofer/ProtFeat/feat_extract/test_seq/Thermophile']
    # filePaths=['./test_seq/NP/NP2/Train/trainingSetFeatures.csv']
    print("FilePaths: \n",filePaths)
    fileNames=fileNameFromPaths (filePaths)
    print("FileNames:",fileNames)
    # One row per input file; columns filled in as metrics are computed.
    resDict = pd.DataFrame(index=fileNames,
                           columns=['Accuracy','Accuracy_SD',
                                    'f1','f1_SD','dummy_freq:Accuracy','dummy_freq:f1',
                                    'LargestClassPercent','Classes',
                                    # 'TopRFE-Features','Best (f1) Model parameters',
                                    '# Classes', 'Array-Acc-Scores' ,'Array-f1-Scores' ,'bestML-Acc','bestML-f1','dummy_freq_f1_weighted'])
    #redDict holds results for each file/class, for saving to output-file
    i=-1
    for filePath in filePaths:
        i +=1
        'http://pythonconquerstheuniverse.wordpress.com/2008/06/04/gotcha-%E2%80%94-backslashes-in-windows-filenames/'
        filePath = os.path.normpath(filePath)
        print(filePath)
        fileName=str(fileNames[i]) #Str added now 14.1
        print("fileName: %s" %(fileName))
        "resDict['Name']= fileName"
        # filePath = str(argv[1])
        # X, y, lb_encoder,featureNames = load_data(filePath+fileName, 'file') # X, y = features, labels
        X, y, lb_encoder,featureNames = load_data(filePath, 'file') # X, y = features, labels
        print(X.shape,"= (samples, features)")
        # Majority-class share, as a baseline for the accuracy numbers below.
        y_inv = Counter(lb_encoder.inverse_transform(y))
        MajorityPercent = round(100*y_inv.most_common()[0][1]/sum(y_inv.values()),1)
        print("Classes:", lb_encoder.classes_)
        print("MajorityClassPercent:", MajorityPercent)
        resDict.LargestClassPercent[fileName] = MajorityPercent
        resDict.Classes[fileName] = str(lb_encoder.classes_)
        resDict["# Classes"][fileName]=len(lb_encoder.classes_)
        KFilt=None
        KFilt=350 #This is just temporary for the outputs - saves computation time. Barely filters compared to the model itself.
        # Univariate filtering: keep the K best features, then an FWE F-test.
        if KFilt is not None:
            k = SelectKBest(k=KFilt).fit(X,y)
            X=k.transform(X)
            featureNames=featureNames[k.get_support()]
        Fwe = SelectFwe(alpha=0.01).fit(X,y)
        X=Fwe.transform(X)
        featureNames=featureNames[Fwe.get_support()]
        print("X reduced to K best features: ",X.shape)
        # Optional heavier selection strategies — both disabled here.
        FeatSelection_SVM=False #Feature Names need updating!!
        FeatSelection_RandLogReg=False
        if FeatSelection_RandLogReg == True:
            LogRegFeats = RandomizedLogisticRegression(C=10, scaling=0.5, sample_fraction=0.95, n_resampling=40, selection_threshold=0.2,n_jobs=-1).fit(X,y)
            X_L1 = LogRegFeats.transform(X)
            featureNames=featureNames[LogRegFeats.get_support()]
            print("RandomizedLogisticRegression Feature Selection ->:",X_L1.shape)
        elif FeatSelection_SVM == True:
            svc_L1= LinearSVC(C=30, penalty="l2", dual=False,class_weight='auto').fit(X, y)
            X_L1 = svc_L1.transform(X, y)
            featureNames=featureNames[list(set(np.where(svc_L1.coef_ != 0)[-1]))]
            print ("L1 SVM Transformed X:",X_L1.shape)
        # X=X_L1
        '''
        print("Performance as a function of percent of features used:")
        PlotPerfPercentFeatures(X,y,est=LinearSVC())
        '''
        'EG - graph best features; feature selection using RF, ensemble classifiers..'
        'http://nbviewer.ipython.org/github/herrfz/dataanalysis/blob/master/assignment2/samsung_data_prediction_submitted.ipynb'
        # Optional recursive feature elimination — disabled here.
        RFE_FeatsToKeep = 16
        FeatSelection_RFE=False
        FeatSelection_RFECV=False
        if (FeatSelection_RFE or FeatSelection_RFECV) == True:
            'RFE + - best feats'
            'http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html '
            svc = LinearSVC(class_weight='auto')#,penalty='l1',dual=False)
            # svc = LogisticRegression(class_weight='auto')#,C=1)
            if FeatSelection_RFECV==True:
                rfecv = RFECV(estimator=svc, step=RFE_FeatsToKeep,scoring='average_precision') # ,cv=StratifiedShuffleSplit(y,n_iter=3,test_size=0.3)) #,scoring='f1',verbose=0) # " scoring='roc_auc','recall','f1',accuracy..."
            else:
                rfecv = RFE(estimator=svc,n_features_to_select=RFE_FeatsToKeep, step=0.03)
            rfecv.fit(X, y)
            if FeatSelection_RFECV==True:
                print("RFE-CV selected %d features : " % (rfecv.n_features_))
            print("RFE (%d features) scorer : " % (rfecv.n_features_),rfecv.score(X, y) )
            rfe_featnames = featureNames[rfecv.get_support()]
            featureNames = featureNames[rfecv.get_support()]
            print("RFE selected feature names:",rfe_featnames)
            X_RFE = rfecv.fit_transform(X, y)
            print("X_RFE",X_RFE.shape)
            resDict['TopRFE-Features'][fileName]=str(rfe_featnames)
            'Set GetRFEPerf To true or by user, if perf. of reduced set wanted'
        GetRFEPerf=False
        # print("lb_encoder.classes_",lb_encoder.classes_)
        'Blind score boxplot graphic example using Seaborn: http://nbviewer.ipython.org/github/cs109/2014/blob/master/homework-solutions/HW5-solutions.ipynb '
        'Confusion matrixes + Dummies - http://bugra.github.io/work/notes/2014-11-22/an-introduction-to-supervised-learning-scikit-learn/'
        'http://scikit-learn.org/stable/modules/model_evaluation.html#dummy-estimators'
        "http://blog.yhathq.com/posts/predicting-customer-churn-with-sklearn.html"
        print()
        "Make custom F1 scorer. May not have fixed problem!"
        # NOTE(review): sklearn's module is sklearn.metrics (there is no
        # sklearn.metrics.score) — this import looks broken; confirm.
        from sklearn.metrics.score import make_scorer
        f1_scorer = make_scorer(metrics.f1_score, greater_is_better=True, average="micro") #Maybe another metric? May NOT be fixed!?. #weighted, micro, macro, none
        # print("Dummy classifiers output:")
        # Baseline: always predict the most frequent class.
        dummy_frequent = DummyClassifier(strategy='most_frequent',random_state=0)
        y_dummyPred = Get_yPred(X,y,clf_class=dummy_frequent)
        dummy_freq_acc = '{:.3}'.format(metrics.accuracy_score(y,y_dummyPred ))
        dummy_freq_f1 = '{:.3}'.format(metrics.f1_score(y, y_dummyPred,average='weighted'))
        dummy_freq_f1_weighted = '{:.3}'.format(f1_scorer(y, y_dummyPred))
        #Get from ALL classes f1..
        dummy_freq_f1_mean=(metrics.f1_score(y, y_dummyPred,average=None)).mean()
        # print("Dummy, most frequent acc:",dummy_freq_acc)
        # dummy_stratifiedRandom = DummyClassifier(strategy='stratified',random_state=0)
        # dummy_strat2= '{:.3%}'.format(metrics.accuracy_score(y, Get_yPred(X,y,clf_class=dummy_frequent))) #,sample_weight=balance_weights(y)))
        # 'print("Dummy, Stratified Random:",dummy_strat2)'
        print()
        resDict['dummy_freq:Accuracy'][fileName]=dummy_freq_acc
        ## resDict['dummy_freq:f1'][fileName]=dummy_freq_f1 dummy_freq_f1_mean
        resDict['dummy_freq:f1'][fileName]=dummy_freq_f1_mean
        resDict['dummy_freq_f1_weighted'][fileName]=dummy_freq_f1_weighted
        # resDict.dummy_Stratfreq[fileName]=dummy_strat2
        "We can get seperately the best model for Acc, and the best for f1!"
        "WARNING!? In binary case - default F1 works for the 1 class, in sklearn 15. and lower"
        # bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = 'f1')
        "Temporary workaround until next SKlearn update of F1 metric:"
        # bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = 'f1')f1_scorer
        # Two separate grid searches: one optimising F1, one accuracy.
        bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = f1_scorer)
        bestEst_acc,bestScore_acc = ModelParam_GridSearch(X,y,cv=2,scoreParam = 'accuracy')
        print("bestEst (f1):",bestEst_f1)#,"best f1",bestScore_f1)
        print("bestEst (f1):",bestEst_acc)#,"best acc",bestScore_acc)
        #Temp
        # bestEst_f1=bestEst_acc=bestEst = RandomForestClassifier(n_jobs=-1)
        if GetRFEPerf==True:
            bestEst_RFE,bestScore_RFE = ModelParam_GridSearch(X_RFE,y,cv=3,scoreParam = 'f1')
        "Modified to get 2 estimators"
        # Cross-validate each winner with repeated stratified shuffle splits.
        scores_acc = cross_val_score(estimator=bestEst_acc, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1) #Accuracy
        print("Accuracy: %0.3f (+- %0.2f)" % (scores_acc.mean(), scores_acc.std() * 2))
        scores_f1 = cross_val_score(estimator=bestEst_f1, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1, scoring='f1')
        print("f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))
        resDict['Accuracy'][fileName]=round(scores_acc.mean(),4)
        resDict['Accuracy_SD'][fileName]=round(scores_acc.std(),4)
        resDict['f1'][fileName]=round(scores_f1.mean(),4)
        resDict['f1_SD'][fileName]=round(scores_f1.std(),4)
        resDict['Array-f1-Scores'][fileName]=(scores_f1)
        resDict['Array-Acc-Scores'][fileName]=(scores_acc)
        resDict['bestML-f1'][fileName]=(str(bestEst_f1))
        resDict['bestML-Acc'][fileName]=(str(bestEst_acc))
        #ORIG
        # Acc,Acc_SD,f1,f1_SD = CV_multi_stats(X, y, bestEst,n=15)
        # resDict['Accuracy'][fileName]=round(Acc,4)
        # resDict['Accuracy_SD'][fileName]=round(Acc_SD,4)
        # resDict['f1 score'][fileName]=round(f1,4)
        # resDict['f1_SD'][fileName]=round(f1_SD,4)
        # resDict['Best (f1) Model parameters'][fileName]= bestEst
        print()
        # print(fileName," Done")
    print("Saving results to file")
    resDict.to_csv("OutputData.tsv", sep=',')
# (commented-out naive-Bayes scratch code retained from the original)
# vnum = len(allvector)
# allvector = npy.array(allvector).T
# for index in range(0,len(TestData)):
# vector = list(allvector[index])
# p = p*vector.count(TestData[index])/vnum
# lbDict[thislb] = p*alllabel
# thislabel = sorted(lbDict,key = lambda x:lbDict[x],reversed=True)[0]
# return thislabel
#
#
# by1 = Bayes()
#
# by1.fit()

# Admission model: screen features with randomized logistic regression,
# then fit a logistic regression on the kept columns.
import pandas as pda
from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR

# NOTE(review): fname is empty — fill in the CSV path before running.
fname = ''
dataf = pda.read_csv(fname)
# BUG FIX: .as_matrix was written without call parentheses, which assigned
# the bound method object instead of the data. Using .values also avoids
# .as_matrix(), which was removed in pandas 1.0.
x = dataf.iloc[:, 1:4].values
y = dataf.iloc[:, 0:1].values

r1 = RLR()
r1.fit(x, y)
r1.get_support()  # feature-screening mask
# print(dataf.columns[r1.get_support()])
t = dataf[dataf.columns[r1.get_support()]].values  # selected columns

r2 = LR()
r2.fit(t, y)
print('训练结束')
# BUG FIX: score on t (the matrix the model was trained on), not on x —
# scoring on x raises a feature-count mismatch.
print('模型正确率为' + str(r2.score(t, y)))
import matplotlib
filename = r'D:\DataAnalysis\Python_practice\chapter5\demo\data\bankloan.xls'
data = pd.read_excel(filename)
# DataFrames are passed straight to sklearn here, so the .as_matrix()
# conversion used in the book is unnecessary.
x = data.iloc[:, :8]  # columns 0-7: predictors (iloc = positional slicing)
y = data.iloc[:, 8]   # column 8: default label

from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR

# Stability selection: features scoring below selection_threshold
# (default 0.25) are dropped; scores available via .scores_.
rlr = RLR()
rlr.fit(x, y)
mask = rlr.get_support()
filter_columns = data.columns[0:8][mask]  # names of the kept predictors
print(u'---------start-----------')
print(u'有效特征为: %s' % ','.join(filter_columns))
x = data[filter_columns]

# Plain logistic regression trained on the reduced predictor set; its
# rounded predictions are appended to the frame as a new column.
lr = LR()
lr.fit(x, y)
predictions = lr.predict(x)
data['预测值'] = [int(np.round(p)) for p in predictions]
print(u'---------end-----------')
print(u'模型的平均正确率为%s' % lr.score(x, y))
def _report_search(best_params, cat, rl, bu, kernel_name, search,
                   extracted_features):
    """Write one kernel's result section to stdout and to the open report file.

    Reproduces, byte-for-byte, the output of the four copy-pasted report
    blocks this helper replaces.
    """
    header = ("Best parameters found on training set with the %s kernel:\n%s %s"
              % (kernel_name, search.best_params_, search.best_score_))
    print(header)
    best_params.write(header + "\n")
    # LinearSVC's search space has no 'kernel' entry; the original hard-coded
    # 'linear' for that section.
    kernel_value = search.best_params_.get("kernel", "linear")
    print("kernel[(\"%s\", \"%s\", \"%s\")] = \"%s\"" % (cat, rl, bu, kernel_value))
    best_params.write("\nkernel[(\"%s\", \"%s\", \"%s\")] = \"%s\"\n"
                      % (cat, rl, bu, kernel_value))
    # Emit only the hyper-parameters that were actually searched for this
    # kernel, in the same order the original printed them.
    for param, fmt in (("C", "%f"), ("gamma", "%f"), ("degree", "%d"),
                       ("coef0", "%f")):
        if param in search.best_params_:
            line = ("%s[(\"%s\", \"%s\", \"%s\")] = %s"
                    % (param, cat, rl, bu, fmt % search.best_params_[param]))
            print(line)
            best_params.write(line + "\n")
    print("features[(\"%s\", \"%s\", \"%s\")] = [%s]\n"
          % (cat, rl, bu, ", ".join(extracted_features)))
    best_params.write("features[(\"%s\", \"%s\", \"%s\")] = [%s]\n"
                      % (cat, rl, bu, ", ".join(extracted_features)))
    print("Random LOOCV scores on development set:")
    best_params.write("Random LOOCV scores on development set:\n")
    means = search.cv_results_['mean_test_score']
    stds = search.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, search.cv_results_['params']):
        print("%0.5f (stdev %0.5f) for %r" % (mean, std, params))
        best_params.write("%0.5f (stdev %0.5f) for %r\n" % (mean, std, params))


def hyperparameterSearch(training_set_path, cat, rl, bu):
    """Randomized SVM hyper-parameter search over four kernels.

    Loads descriptors for (rl, bu), min-max scales them, reduces the feature
    set with randomized logistic regression, then runs a RandomizedSearchCV
    (f1 scoring, leave-one-group-out CV over `labels`) for the RBF, linear,
    polynomial and sigmoid kernels, writing a per-kernel report to
    "<training_set_path>best_parameters_test_<cat>_<rl>_<bu>.txt".

    Improvement over the original: the four copy-pasted search + report
    sections are folded into data-driven loops (see _report_search).
    """
    print("Importing descriptors from the training set.")
    X, y, labels = import_descriptors(
        training_set_path, "*_%s_%s_train_descriptors_N20.txt" % (rl, bu))
    print("Number of features: %d." % X.shape[-1])
    print("Scaling data.")
    min_max_scaler = MinMaxScaler()
    X_scale = min_max_scaler.fit_transform(X.todense())
    print("Performing feature selection with randomized logistic regression.")
    # set n_jobs=-1 to parallelize the Randomized Logistic Regression;
    # however, a bug in scikit-learn 0.18.1 raises
    # "ValueError: assignment destination is read-only" when n_jobs > 1.
    feature_selector = RandomizedLogisticRegression(n_jobs=1)
    X_scale = feature_selector.fit_transform(X_scale, y)
    print("Reduced number of features: %d." % X_scale.shape[-1])

    # One (display name, estimator, search space) triple per kernel.
    kernel_searches = [
        ("RBF", SVC(), {'kernel': ['rbf'],
                        'C': expon(scale=2000),
                        'gamma': expon(scale=.01)}),
        ("linear", LinearSVC(), {'C': expon(scale=2000)}),
        ("polynomial", SVC(), {'kernel': ['poly'],
                               'C': expon(scale=2000),
                               'degree': randint(2, 11),
                               'coef0': uniform(loc=-2, scale=4),
                               'gamma': expon(scale=.01)}),
        ("sigmoid", SVC(), {'kernel': ['sigmoid'],
                            'C': expon(scale=2000),
                            'coef0': uniform(loc=-2, scale=4),
                            'gamma': expon(scale=.01)}),
    ]
    fitted = []
    for kernel_name, estimator, param_dist in kernel_searches:
        print("Running randomized hyper-parameter search with Leave-One-Out "
              "validation for the %s kernel." % kernel_name)
        search = RandomizedSearchCV(estimator,
                                    param_distributions=param_dist,
                                    n_iter=100,
                                    scoring='f1',
                                    cv=LeaveOneGroupOut(),
                                    n_jobs=-1,
                                    error_score=0,
                                    iid=False,
                                    refit=False)
        search.fit(X_scale, y, groups=labels)
        fitted.append((kernel_name, search))

    with open("%sbest_parameters_test_%s_%s_%s.txt"
              % (training_set_path, cat, rl, bu), "w") as best_params:
        # 1-based indices of the columns kept by the feature selector.
        extracted_features = [
            "%d" % (i + 1) for i in feature_selector.get_support(indices=True)
        ]
        for kernel_name, search in fitted:
            _report_search(best_params, cat, rl, bu, kernel_name, search,
                           extracted_features)
# Listing 5-1: logistic regression on the bank-loan data.
import pandas as pd

# parameter initialisation
src = 'data/bankloan.xls'
data = pd.read_excel(src)
x = data.iloc[:, :8].as_matrix()   # predictors: first 8 columns
y = data.iloc[:, 8].as_matrix()    # target: column 8 (default label)

# plain logistic regression
from sklearn.linear_model import LogisticRegression as LR
# randomized logistic regression (stability selection)
from sklearn.linear_model import RandomizedLogisticRegression as RLR

# Screen the variables; per-feature scores are also available via .scores_.
screener = RLR()
screener.fit(x, y)
screener.get_support()
print(u'通过随机逻辑回归模型筛选特征结束。')
print(u'有效特征为 %s' % '.'.join(data.columns[screener.get_support()]))

# Refit an ordinary logistic regression on the surviving columns.
x = data[data.columns[screener.get_support()]].as_matrix()
model = LR()
model.fit(x, y)
print(u'逻辑回归模型训练结束。')
# average in-sample accuracy (about 81.4% on this data set)
print(u'模型的平均正确率为 %s' % model.score(x, y))
# 'Normalize/Scale features if needed. Our data is standardized by default'
# X = StandardScaler(copy=False).fit_transform(X)

# Univariate F-test filter (family-wise error controlled at alpha=0.01).
Fwe = SelectFwe(alpha=0.01).fit(X, y)
X = Fwe.transform(X)
featureNames = featureNames[Fwe.get_support()]
print("F-test filter ->", X.shape)

# Exactly one of the two model-based selection strategies below runs.
FeatSelection_SVM = True
FeatSelection_RandLogReg = False

if FeatSelection_RandLogReg:
    LogRegFeats = RandomizedLogisticRegression(
        C=5, scaling=0.5, sample_fraction=0.8, n_resampling=60,
        selection_threshold=0.2, n_jobs=-1)
    X = LogRegFeats.fit_transform(X, y)
    featureNames = featureNames[LogRegFeats.get_support()]
    print("RandomizedLogisticRegression Feature Selection ->:", X.shape)
elif FeatSelection_SVM:
    # BUG FIX: this branch previously indexed featureNames with
    # LogRegFeats.get_support(), but LogRegFeats is only created in the other
    # branch, so taking this path raised a NameError. Keep the fitted L1-SVM
    # and derive the support mask from its non-zero coefficients instead.
    # NOTE(review): old sklearn's fit_transform pruned on a tiny coefficient
    # threshold rather than exact zero — confirm this matches your version.
    svm_selector = LinearSVC(C=1, penalty="l1", dual=False,
                             class_weight='auto').fit(X, y)
    support_mask = (svm_selector.coef_ != 0).any(axis=0)
    X = X[:, support_mask]
    featureNames = featureNames[support_mask]
    # X= LogisticRegression(C=0.01,class_weight='auto').fit_transform(X, y)
    print("SVC Transformed X:", X.shape)

'''
print("Plot #Feats vs Classification performance:")
PlotPerfPercentFeatures(X_LR,y,est=SVC(C=100))
'''
KFilt = None
# KFilt=200
# Bank-loan logistic regression with randomized-LR feature screening.
import pandas as pd

filename = 'bankloan.xls'
data = pd.read_excel(filename)
x = data.iloc[:, :8].as_matrix()  # predictors: first 8 columns
y = data.iloc[:, 8].as_matrix()   # target: column 8 (default label)

from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR

rlr = RLR()
# BUG FIX: the model was fitted twice in a row; a single fit is sufficient
# (the second fit simply discarded the first, wasting the whole resampling run).
rlr.fit(x, y)
rlr.get_support()
print("end search useful_data")
# BUG FIX: the support mask has 8 entries (one per feature), but data.columns
# also contains the label column; index only the 8 feature columns so the mask
# and the names stay aligned.
print(u'end search useful data: %s' % ''.join(data.columns[:8][rlr.get_support()]))
x = data[data.columns[:8][rlr.get_support()]].as_matrix()

lr = LR()
lr.fit(x, y)  # refit on the selected features only
print()
print('%s' % lr.score(x, y))
# -*- coding:utf-8 -*- # 逻辑回归:自动建模 import pandas as pd from sklearn.linear_model import LogisticRegression as LR from sklearn.linear_model import RandomizedLogisticRegression as RLR data = pd.read_excel("c://mldata//bankloan.xls", header=0) # x = data.iloc[:, :8].as_matrix() # y = data.iloc[:, 8].as_matrix() 和下边的两种读取数据的方式,都会带来精度的影响 train_data = data.values # 将读取的数据其转换为矩阵形式 train_x = train_data[0::, :8] train_label = train_data[0::, 8] rlr = RLR() # 建立随机回归模型,筛选变量 rlr.fit(train_x, train_label) # 训练模型 rlr.get_support() # 获取特征筛选结果 print u"特征筛选结束" print u"有效特征为:%s" % u'、'.join(data.columns[rlr.get_support()]) x = data[data.columns[rlr.get_support()]].as_matrix() # 筛选好的特征 lr = LR() lr.fit(x, train_label) # 用筛选好的特征数据来训练模型 print u'逻辑回归训练结束' print u'模型的平均正确率为:%s' % lr.score(x, train_label)
def pick_variables(x,y,descover=True,method="rlr",threshold=0.25,sls=0.05):  # default threshold 0.25
    """Variable-selection helper: return DataFrame x reduced to the chosen columns.

    method:
        "rlr"   - randomized logistic regression (stability selection)
        "bs"    - backward elimination on statsmodels logit p-values
        "fs"    - forward selection maximising pseudo R-squared
        "fs_bs" - forward selection with a backward p-value check each step
        "rfc"   - random-forest importances, keep the top 15 features
    threshold - stability-score cut-off for "rlr"
    sls       - significance level to stay, for "bs"/"fs_bs"
    NOTE(review): `descover` is accepted but never used.
    y is left untouched; only the reduced x is returned.
    """
    if method == "rlr":
        # Randomized logistic regression picks variables linearly related to y
        # (stability selection): the selector is run on many data/feature
        # subsamples and the selection results are aggregated.
        rlr = RandomizedLogisticRegression(selection_threshold=threshold)
        rlr.fit(x,y)
        # Per-variable stability scores, kept for inspection (not returned).
        scoretable = pd.DataFrame(rlr.all_scores_,index = x.columns,columns = ['var_score'])
        columns_need = list(x.columns[rlr.get_support()])
        x = x[columns_need]
    # Backward elimination.
    if method =="bs" and x.shape[1] > 1:
        # Gather variable names and merge x/y into one frame for the formula API.
        data = pd.concat([x, y], axis=1)
        var_list = x.columns
        response = y.name
        # Fit on all remaining variables, then repeatedly drop the least
        # significant one until every survivor is significant at `sls`.
        while True:
            formula = "{} ~ {} + 1".format(response, ' + '.join(var_list))
            mod = smf.logit(formula, data).fit()
            print(mod.summary2())
            p_list = mod.pvalues.sort_values()
            if p_list[-1] > sls:
                # Take the last (largest p-value) entry of p_list...
                var = p_list.index[-1]
                # ...and remove it from var_list.
                var_list = var_list.drop(var)
            else:
                break
        x=x[var_list]
    # Forward selection.
    if method =="fs":
        data = pd.concat([x, y], axis=1)
        response=y.name
        remaining = set(x.columns)
        selected = []
        current_score, best_new_score = 0.0, 0.0
        # Greedily add the candidate that most improves pseudo R-squared;
        # stop when no candidate improves the score.
        while remaining and current_score == best_new_score:
            scores_with_candidates = []
            for candidate in remaining:
                formula = "{} ~ {} + 1".format(response, ' + '.join(selected + [candidate]))
                mod = smf.logit(formula, data).fit()
                score = mod.prsquared
                scores_with_candidates.append((score, candidate))
            scores_with_candidates.sort(reverse=False)
            best_new_score, best_candidate = scores_with_candidates.pop()
            if current_score < best_new_score:
                remaining.remove(best_candidate)
                selected.append(best_candidate)
                current_score = best_new_score
        print(len(selected))
        x=x[selected]
    # rsquared_adj / prsquared
    if method =="fs_bs":
        # Forward selection as above, plus a backward check after each
        # addition: if the newly refit model leaves some variable
        # insignificant, drop it again.
        data = pd.concat([x, y], axis=1)
        response=y.name
        remaining = set(x.columns)
        selected = []
        current_score, best_new_score = 0.0, 0.0
        while remaining and current_score == best_new_score:
            scores_with_candidates = []
            for candidate in remaining:
                formula = "{} ~ {} + 1".format(response, ' + '.join(selected + [candidate]))
                mod = smf.logit(formula, data).fit()
                score = mod.prsquared
                scores_with_candidates.append((score, candidate))
            scores_with_candidates.sort(reverse=False)
            best_new_score, best_candidate = scores_with_candidates.pop()
            if current_score < best_new_score:
                print("===========================")
                remaining.remove(best_candidate)
                selected.append(best_candidate)
                current_score = best_new_score
                formula2= "{} ~ {} + 1".format(response, ' + '.join(selected))
                mod2 = smf.logit(formula2,data).fit()
                p_list = mod2.pvalues.sort_values()
                if p_list[-1] > sls:
                    # Take the last (least significant) index of p_list...
                    var = p_list.index[-1]
                    # ...and remove it from the selected set.
                    selected.remove(var)
                    print(p_list[-1])
                    formula3= "{} ~ {} + 1".format(response, ' + '.join(selected))
                    mod3 = smf.logit(formula3, data).fit()
                    best_new_score = mod3.prsquared
                    current_score = best_new_score
        print(len(selected))
        x=x[selected]
    '''
    注意这里调用的是statsmodels.api里的逻辑回归。这个回归模型可以获取每个变量的显著性p值,p值越大越不显著,当我们发现多于一个变量不显著时,
    不能一次性剔除所有的不显著变量,因为里面可能存在我们还未发现的多变量的多重共线性,我们需要迭代的每次剔除最不显著的那个变量。
    上面迭代的终止条件:
    ①剔除了所有的不显著变量
    ②剔除了某一个或某几个变量后,剩余的不显著变量变得显著了。(说明之前存在多重共线性)
    '''
    if method =="rfc":
        # Random-forest importance ranking; keep the 15 most important
        # features, then fit a statsmodels Logit (with intercept) for a
        # significance summary before returning the reduced x.
        RFC = RandomForestClassifier(n_estimators=200,max_depth=5,class_weight="balanced")
        RFC_Model = RFC.fit(x,y)
        features_rfc = x.columns
        featureImportance = {features_rfc[i]:RFC_Model.feature_importances_[i] for i in range(len(features_rfc))}
        featureImportanceSorted = sorted(featureImportance.items(),key=lambda x: x[1], reverse=True)
        features_selection = [k[0] for k in featureImportanceSorted[:15]]
        x = x[features_selection]
        x['intercept'] = [1]*x.shape[0]
        LR = sm.Logit(y, x).fit()
        summary = LR.summary()
        print(summary)
        x=x.drop("intercept",axis=1)
    return x
#-*- coding:utf-8 -*-
# Bank-loan scoring: screen variables with randomized logistic regression,
# then train an ordinary logistic regression on the surviving features.
import pandas as pd

# parameter initialisation
data_file = 'E:\\3data-mining\\2py-testing\\data and code\\chapter5\\demo\\data\\bankloan.xls'
data = pd.read_excel(data_file)
x = data.iloc[:, :8].as_matrix()  # predictors: first 8 columns
y = data.iloc[:, 8].as_matrix()   # target: column 8

# import the logistic models from sklearn
from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR

screener = RLR()  # randomized LR: stability-based variable screening
screener.fit(x, y)
screener.get_support()  # selection result
print(u'通过随机逻辑回归模型筛选特征结果')
print(u'有效特征为:%s' % ','.join(data.columns[screener.get_support(indices=True)]))
x = data[data.columns[screener.get_support(indices=True)]].as_matrix()

clf = LR()       # ordinary logistic regression
clf.fit(x, y)    # trained on the selected features only
print(u'逻辑回归模型训练结束')
print(u'模型的平均正确率:%s' % clf.score(x, y))  # in-sample accuracy
#clf = svm.SVC(kernel='linear') #rfecv = RFECV(estimator=clf, step=1, cv=StratifiedKFold(5), # scoring='accuracy') #rfecv.fit(train_data, emot) ##x_label = range(1, len(rfecv.grid_scores_) + 1) ##y_label = rfecv.grid_scores_ ##有效特征标签 #support=rfecv.support_ ##获取有效特征数据 #train_data=rfecv.transform(train_data) #特征筛选,使用RLR from sklearn.linear_model import RandomizedLogisticRegression as RLR rlr = RLR() rlr.fit(train_data, probs) rlr.get_support() #准备回归分类器 import sklearn from sklearn import gaussian_process, kernel_ridge, isotonic from sklearn.ensemble import ExtraTreesClassifier Regressors = { # 'pls':cross_decomposition.PLSRegression(),报错 'gradient boosting': ensemble.GradientBoostingRegressor(), # 'gaussian':gaussian_process.GaussianProcessRegressor(),报错 # 'isotonic':isotonic.IsotonicRegression(),报错 'kernelridge': kernel_ridge.KernelRidge(), 'ARD': linear_model.ARDRegression(), 'bayesianridge': linear_model.BayesianRidge(), # 'elasticnet':linear_model.ElasticNet(),#报错 'HuberRegressor': linear_model.HuberRegressor(),
#-*- coding: utf-8 -*-
# Logistic regression, automated modelling (bank-loan data).
import pandas as pd

# parameter initialisation
src = '../data/bankloan.xls'
data = pd.read_excel(src)
x = data.iloc[:, :8].as_matrix()  # first 8 columns: predictors
y = data.iloc[:, 8].as_matrix()   # column 8: default label

from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR

selector = RLR()   # randomized LR screens variables (per-feature scores: .scores_)
selector.fit(x, y)
selector.get_support()
print(u'通过随机逻辑回归模型筛选特征结束。')
print(u'有效特征为:%s' % ','.join(data.columns[selector.get_support()]))

# Refit a plain logistic regression on the selected columns only.
x = data[data.columns[selector.get_support()]].as_matrix()
model = LR()
model.fit(x, y)
print(u'逻辑回归模型训练结束。')
print(u'模型的平均正确率为:%s' % model.score(x, y))  # about 81.4% here
import pandas as pd
from sklearn.linear_model import RandomizedLogisticRegression as RLR
from sklearn.linear_model import LogisticRegression as LR

# Read the admissions data: column 0 is the label, columns 1-3 the predictors.
data = pd.read_csv("C:/Users/T/Desktop/python视频/luqu.csv")
x = data.iloc[:, 1:4].as_matrix()
# BUG FIX: y was the (n, 1) slice iloc[:, :1]; the estimators expect a 1-D
# label vector, so take the column itself.
y = data.iloc[:, 0].as_matrix()

# Randomized logistic regression: stability-based variable screening.
f1 = RLR()
f1.fit(x, y)
# BUG FIX: the support mask was computed and silently discarded, so the
# script produced no output at all; print the screening result.
print(f1.get_support())

# Ordinary logistic model (kept on the full design matrix, as before).
f2 = LR()
f2.fit(x, y)
# BUG FIX: the accuracy was computed but never printed.
print(f2.score(x, y))
'''第五章'''
'''Logistic回归'''
# Chapter 5 demo: logistic regression on the bank-loan data set.
import pandas as pd

filename = 'C:/Users/Administrator/Desktop/chapter5/demo/data/bankloan.xls'
data = pd.read_excel(filename)
x = data.iloc[:, :8].as_matrix()  # predictors
y = data.iloc[:, 8].as_matrix()   # label
x.shape
y.shape

from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR

screen = RLR()  # per-feature stability scores are available via .scores_
screen.fit(x, y)
screen.get_support()
print('通过随机逻辑回归模型筛选特征结束。')
print('有效特征为:%s' % ','.join(data.columns[screen.get_support()]))

# Refit an ordinary logistic regression on the surviving columns.
x = data[data.columns[screen.get_support()]].as_matrix()
fitter = LR()
fitter.fit(x, y)
print('逻辑回归模型训练结束')
print('模型的平均正确率为:%s' % fitter.score(x, y))

'''K-Means聚类'''
import pandas as pd

# parameter initialisation: sales and other attribute data
inputfile = 'C:/Users/Administrator/Desktop/chapter5/demo/data/consumption_data.xls'
import pandas as pd
from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR

lessonPath = 'E:\\BaiduNetdiskDownload\\sourceCode\\week8\\lesson2.csv'  # unused here; kept for parity
luquPath = 'E:\\BaiduNetdiskDownload\\sourceCode\\week8\\luqu2.csv'
dataLuqu = pd.read_csv(luquPath)

# Column 0 is the label, columns 1-3 are the features.
x = dataLuqu.iloc[:, 1:4].as_matrix()
y = dataLuqu.iloc[:, 0:1].as_matrix()

# Stability selection with randomized logistic regression.
r1 = RLR()
r1.fit(x, y)
r1.get_support()
# BUG FIX: the original trained on dataLuqu.columns[...] — the column NAMES —
# instead of the data, and indexed the full column list (which also contains
# the label) with a mask that only covers the 3 feature columns. Select the
# DATA of the surviving feature columns instead.
t = dataLuqu[dataLuqu.columns[1:4][r1.get_support()]].as_matrix()

r2 = LR()
r2.fit(t, y)
print('训练结束')
# BUG FIX: score on the same reduced matrix the model was trained on;
# scoring on the full `x` raises a feature-count mismatch.
print('模型正确率: ' + str(r2.score(t, y)))
# NOTE: RandomizedLogisticRegression was changed/removed in later scikit-learn
# releases; no good drop-in replacement has been wired in here yet.
from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR

# Logistic regression, automated modelling.
fileName = './bankloan.xls'
data = pd.read_excel(fileName)
feats = data.iloc[:, :8].values   # first 8 columns: predictors
target = data.iloc[:, 8].values   # last column: label
print(feats)
print(target)

# Randomized logistic regression screens the variables.
picker = RLR()
picker.fit(feats, target)
picker.get_support(indices=True)  # indices of the selected features
print(picker.get_support(indices=True))
print("通过随机逻辑回归模型筛选特征结果")
print('有效特征为: %s' % ','.join(data.columns[picker.get_support(indices=True)]))

# Keep only the selected feature columns.
feats = data[data.columns[picker.get_support(indices=True)]].values

# Fit the final logistic regression on the reduced matrix.
model = LR(solver='liblinear')
model.fit(feats, target)
print("逻辑回归模型训练结束")
print('平均准确率为: %s' % model.score(feats, target))