def get_reg_params(self, X, y):
    """Return the regularization grid as a list of ``{"C": value}`` dicts.

    When ``self.Cs`` is ``None`` it is first filled in (and kept on the
    instance) with ``self.n_C`` values: ``l1_min_c(X, y, loss='log')``
    multiplied by ``np.logspace(0, 7, self.n_C)``.  An already-set
    ``self.Cs`` is used as-is and ``X`` / ``y`` are not touched.
    """
    if self.Cs is None:
        lower_bound = l1_min_c(X, y, loss='log')
        self.Cs = lower_bound * np.logspace(0, 7, self.n_C)
    return [dict(C=value) for value in self.Cs]
def compute_coefs():
    """Fit and plot an L1-penalised logistic regression path.

    Loads train/test data with ``load_labled_point()``, centres the features,
    fits one model per ``C`` on a log-spaced grid that starts at
    ``l1_min_c(X, y, loss='log')``, prints the elapsed time, prints the
    predictions of the last fitted model (largest ``C``) next to the true test
    labels, and finally plots every coefficient against ``log10(C)``.
    """
    X, y, tX, ty = load_labled_point()

    # Centre the training features and apply the *same* shift to the test
    # features; the model is fitted on centred data, so predicting on
    # un-centred test data would evaluate it on a shifted feature space.
    feature_mean = np.mean(X, 0)
    X -= feature_mean
    tX = tX - feature_mean

    cs = l1_min_c(X, y, loss='log') * np.logspace(0, 3)

    start = datetime.now()
    # solver : {'newton-cg', 'lbfgs', 'liblinear', 'sag'}.  It is named
    # explicitly because the L1 penalty is not supported by every solver and
    # the library default has changed between releases.
    clf = linear_model.LogisticRegression(C=1.0, penalty='l1',
                                          solver='liblinear', tol=1e-6)
    coefs_ = []
    for c in cs:
        clf.set_params(C=c)
        clf.fit(X, y)
        coefs_.append(clf.coef_.ravel().copy())
    print("This took ", datetime.now() - start)

    # Predictions come from the model fitted with the last value in ``cs``.
    pred_y = clf.predict(tX)
    print('pred',' ','real')
    for predicted, actual in zip(pred_y, ty):
        print(predicted,' ',actual[0])

    coefs_ = np.array(coefs_)
    plt.plot(np.log10(cs), coefs_)
    plt.xlabel('log(C)')
    plt.ylabel('Coefficients')
    plt.title('Logistic Regression Path')
    plt.axis('tight')
    plt.show()
def lasso_selecting(train_feature, train_label, test_feature, test_label, alpha_base):
    """Yield one evaluation record per ``C`` along an L1 logistic path.

    The grid holds 10 values: ``l1_min_c(train_feature, train_label,
    loss="log")`` times ``np.logspace(0, 3, 10)``.  For each value the
    warm-started estimator is refitted on the training data and a dict is
    yielded with keys ``"model"``, ``"log(C)"``, ``"features"`` (pairs of
    column name and non-zero coefficient), ``"score"`` (result of
    ``evaluate`` on the positive-class probabilities of ``test_feature``) and
    ``"coef"``.

    ``alpha_base`` is accepted but not used by this function.

    NOTE: the same estimator object is yielded under ``"model"`` on every
    iteration and is refitted in place on the next one.
    """
    lower_bound = l1_min_c(train_feature, train_label, loss="log")
    c_grid = lower_bound * np.logspace(0, 3, 10)

    print("Computing regularization path ...")
    estimator = LogisticRegression(
        penalty="l1",
        solver="liblinear",
        tol=1e-6,
        max_iter=int(1e6),
        warm_start=True,
        intercept_scaling=10000.0,
    )

    for c_value in tqdm(c_grid):
        estimator.set_params(C=c_value)
        estimator.fit(train_feature, train_label)

        weights = estimator.coef_.ravel().copy()
        selected = []
        for position, column in enumerate(train_feature.columns):
            weight = weights[position]
            if weight != 0.0:
                selected.append((column, weight))

        positive_proba = estimator.predict_proba(test_feature)[:, 1]
        record = {
            "model": estimator,
            "log(C)": np.log10(c_value),
            "features": selected,
            "score": evaluate(test_label, positive_proba),
            "coef": weights,
        }
        yield record
def test_l1_min_c(self):
    """``ModelFrame.svm.l1_min_c()`` must match ``svm.l1_min_c`` on iris."""
    dataset = datasets.load_iris()
    frame = pdml.ModelFrame(dataset)

    # Accessor result first, reference value second (same order as before).
    self.assertAlmostEqual(
        frame.svm.l1_min_c(),
        svm.l1_min_c(dataset.data, dataset.target),
    )
def tune_C_regularization_path(clf, features, labels, ind2label, num_steps=16, num_seeds=50):
    """Walk an increasing grid of ``C`` and return the first acceptable one.

    The grid is ``l1_min_c(features, labels, loss='log')`` multiplied by
    ``np.logspace(1, 8, num_steps)``.  ``clf`` is switched to warm-start mode
    and refitted for each value.  The search stops at the first ``C`` for
    which every inspected row of ``clf.coef_`` has more than ``num_seeds``
    strictly positive coefficients.

    Args:
        clf: estimator exposing ``set_params``, ``fit`` and ``coef_``.
        features: training features passed to ``l1_min_c`` and ``clf.fit``.
        labels: training labels.
        ind2label: when its length is 2 only row 0 of ``coef_`` is inspected
            (binary case); otherwise every value obtained by iterating it is
            used as a row index into ``coef_``.
        num_steps: number of grid points.
        num_seeds: positive-coefficient count that each row must exceed.

    Returns:
        The first ``C`` satisfying the criterion, or the last (largest) value
        tried when none does.

    Raises:
        ValueError: if the grid is empty (``num_steps`` of 0).
    """
    clf.set_params(warm_start=True)

    # Exponents of the multiplier applied to the l1_min_c lower bound:
    # the grid spans 10**1 .. 10**8 times that bound.
    min_c = 1
    max_c = 8

    # l1_min_c gives the lowest bound for C such that for C in
    # (l1_min_c, infinity) the model is guaranteed not to be empty.
    cs = l1_min_c(features, labels, loss='log') * np.logspace(min_c, max_c, num_steps)
    if len(cs) == 0:
        raise ValueError("num_steps must be at least 1")

    print("Computing regularization path ...")
    start = time()
    for c in cs:
        clf.set_params(C=c)
        clf.fit(features, labels)

        # Number of strictly positive coefficients per row of coef_.
        pos = np.sum(clf.coef_ > 0, axis=1)
        if len(ind2label) == 2:
            # binary classification
            flags = [pos[0] > num_seeds]
        else:
            # multiclass classification
            flags = [pos[i] > num_seeds for i in ind2label]

        if all(flags):
            break

    # Reached both on early exit and after exhausting the grid; ``c`` is the
    # accepted value in the first case and the largest one in the second.
    print("This took %0.3fs" % (time() - start))
    return c
def test_L1_iris():
    """Plot the regularization path of L1-penalised logistic regression.

    Uses the iris samples whose target is not 2, centres the features, fits
    one model per ``C`` on the default 50-point ``np.logspace(0, 3)`` grid
    scaled by ``l1_min_c`` and plots every coefficient against ``log10(C)``.
    """
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target

    keep = y != 2
    X = X[keep]
    y = y[keep]
    X -= np.mean(X, 0)

    cs = l1_min_c(X, y, loss='log') * np.logspace(0, 3)

    print("Computing regularization path ...")
    start = datetime.now()
    # The solver is named explicitly: the L1 penalty is not supported by
    # every solver and the library default has changed between releases.
    clf = linear_model.LogisticRegression(C=1.0, penalty='l1',
                                          solver='liblinear', tol=1e-6)
    coefs_ = []
    for c in cs:
        clf.set_params(C=c)
        clf.fit(X, y)
        coefs_.append(clf.coef_.ravel().copy())
    print("This took ", datetime.now() - start)

    coefs_ = np.array(coefs_)
    plt.plot(np.log10(cs), coefs_)
    plt.xlabel('log(C)')
    plt.ylabel('Coefficients')
    plt.title('Logistic Regression Path')
    plt.axis('tight')
    plt.show()
def test():
    """Plot the L1 logistic regression path on the first two iris classes."""
    # Load the data; of the three classes, drop the one labelled 2.
    iris = datasets.load_iris()
    X = iris.data
    Y = iris.target
    keep = Y != 2
    X = X[keep]
    Y = Y[keep]

    # Subtract the mean so relative differences stand out more.
    X -= np.mean(X, 0)

    # Build the grid of C values.
    cs = l1_min_c(X, Y, loss='log') * np.logspace(0, 3)

    # Model to fit.  The solver is named explicitly: the L1 penalty is not
    # supported by every solver and the library default has changed between
    # releases.
    clf = linear_model.LogisticRegression(C=1.0, penalty='l1',
                                          solver='liblinear', tol=1e-6)

    # Fit the path.
    coefs_ = []
    for c in cs:
        clf.set_params(C=c)
        clf.fit(X, Y)
        coefs_.append(clf.coef_.ravel().copy())
    coefs_ = np.array(coefs_)

    # Plot the path.
    plt.plot(np.log10(cs), coefs_)
    plt.xlabel('log(C)')
    plt.ylabel('Coefficients')
    plt.title('Logistic Regression Path')
    plt.axis('tight')
    plt.show()
def test_regularization_path(self):
    """Compare ``LogisticRegressionL1`` coefficients with scikit-learn's.

    A 10x5 synthetic binary problem is fitted by ``LogisticRegressionL1`` on
    a 200-point lambda grid and by scikit-learn along an ``l1_min_c``-scaled
    grid of ``C``.  The scikit-learn intercept and coefficients of the last
    ``C`` must match ``logitfitL1.coef_`` to one decimal place.
    """
    num_samples = 10
    num_feat = 5
    X, y = make_classification(n_samples=num_samples, n_features=num_feat,
                               n_informative=3, n_classes=2, random_state=0,
                               weights=[0.5, 0.5])

    # Design matrix: features, then a column of ones, then the labels.
    matrix = np.zeros((num_samples, num_feat + 2))
    matrix[:, :-2] = X
    matrix[:, -2] = np.ones(num_samples)
    matrix[:, -1] = y

    # Betas to test
    logitfitL1 = LogisticRegressionL1()
    lambda_grid = np.exp(-1 * np.linspace(1, 17, 200))
    logitfitL1.fit(matrix, lambda_grid)

    # Computing regularization path using sklearn.  The solver is named
    # explicitly because the L1 penalty is not supported by every solver and
    # the library default has changed between releases.
    cs = l1_min_c(X, y, loss='log') * np.logspace(0, 3)
    clf = LogisticRegression(C=1.0, penalty='l1', solver='liblinear', tol=1e-6)
    for c in cs:
        clf.set_params(C=c)
        clf.fit(X, y)

    # Only the model fitted with the last C is compared.
    skbetas = np.append(clf.intercept_[0], clf.coef_)
    np.testing.assert_almost_equal(skbetas, logitfitL1.coef_, 1)
def get_minimum_c (self, fname, asset_class, cols):
    """Return the minimum ``C`` for an L1-penalised logistic model.

    Any value smaller than the returned one yields a model with 0
    coefficients.  The bound only applies when an l1 penalty is used.

    Training data comes from ``self.get_train_data(fname, asset_class,
    cols)``; the last row of both arrays is left out of the computation.
    """
    features, target = self.get_train_data(fname, asset_class, cols)
    all_but_last = slice(None, -1)
    return l1_min_c(features[all_but_last], target[all_but_last], loss='log')
def get_minimum_c(self, cols, fname):
    """Return the minimum ``C`` for an L1-penalised logistic model.

    Any value smaller than the returned one yields a model with 0
    coefficients.  The bound only applies when an l1 penalty is used.

    Training data comes from ``self.obtain_training_data(cols, fname)``.
    """
    features, target = self.obtain_training_data(cols, fname)
    return l1_min_c(features, target, loss='log')
def l1_select(interpreted_model: bool, n_jobs: int,
              dataset: Tuple[pd.DataFrame, pd.Series], l1_base_step: int,
              l1_exp_step: float, early_stopping_rounds: Any,
              cv_split: Dict[int, Tuple[Sequence[int], Sequence[int]]],
              verbose=True,
              auc_tol: float = 1e-4) -> Tuple[f_list_type, Result]:
    """Select features with a cross-validated L1 logistic regression.

    Args:
        interpreted_model: forwarded to ``analyze_result``.
        n_jobs: parallelism passed to ``LogisticRegressionCV``.
        dataset: ``(features, target)``; the feature frame's columns name the
            candidate features.
        l1_base_step: number of points in the ``C`` grid.
        l1_exp_step: upper exponent of the grid; the grid is
            ``l1_min_c(...) * np.logspace(0, l1_exp_step, l1_base_step)``.
        early_stopping_rounds: accepted but not used by this function.
        cv_split: fold definition handed to ``PredefinedFolds``.
        verbose: when true, print the grid range and the selected result.
        auc_tol: a candidate is acceptable when its score is within this
            tolerance of the best score among all results.

    Returns:
        ``(features_fit, res)``: the columns whose entry in
        ``res.min_weights`` is non-zero, and the selected result.  ``res`` is
        the first result with ``is_neg`` set whose score is acceptable; when
        none is acceptable the last ``is_neg`` result is used.

    Raises:
        ValueError: if no result has ``is_neg`` set.
    """
    features = dataset[0]
    target = dataset[1]

    # get grid for cs
    cs = l1_min_c(features, target, loss='log', fit_intercept=True) * np.logspace(
        0, l1_exp_step, l1_base_step)
    if verbose:
        print('C parameter range in [{0}:{1}], {2} values'.format(
            cs[0], cs[-1], l1_base_step))

    # fit model with crossvalidation
    cv = PredefinedFolds(cv_split)
    clf = LogisticRegressionCV(Cs=cs,
                               solver='saga',
                               tol=1e-5,
                               cv=cv,
                               penalty='l1',
                               scoring=scorer,
                               intercept_scaling=10000.,
                               max_iter=1000,
                               n_jobs=n_jobs,
                               random_state=42)
    clf.fit(features.values, target.values)

    # analyze cv results
    result = analyze_result(clf, features.columns, interpreted_model)

    # keep only the models flagged ``is_neg``
    scores_neg = [x for x in result if x.is_neg]
    if not scores_neg:
        # Previously this surfaced as an unbound-variable error further down.
        raise ValueError('No model with is_neg set among the CV results')

    # best score over *all* models, relaxed by the tolerance
    max_score = max(x.score for x in result)
    ok_score = max_score - auc_tol

    # first model that is ok with tolerance; fall back to the last candidate
    res = next((x for x in scores_neg if x.score >= ok_score), scores_neg[-1])

    # get selected features
    features_fit = [
        column for (column, weight) in zip(features.columns, res.min_weights)
        if weight != 0
    ]
    if verbose:
        print(res)

    return features_fit, res
def refit_reg(x_train: np.ndarray, y: np.ndarray, l1_grid_size: int,
              l1_exp_scale: float, max_penalty: float,
              interp: bool = True) -> Tuple[np.ndarray, float, np.ndarray]:
    """
    Final model refit with regularization

    Fits a warm-started L1 logistic regression along an increasing grid of
    ``C`` that always ends at ``max_penalty`` and picks one model from it.

    Args:
        x_train: training features.
        y: training target.
        l1_grid_size: number of points in the initial ``C`` grid.
        l1_exp_scale: upper exponent; the grid is
            ``l1_min_c(...) * np.logspace(0, l1_exp_scale, l1_grid_size)``.
        max_penalty: largest ``C`` allowed; larger grid values are dropped
            and ``max_penalty`` itself is appended when not already last.
        interp: when false, take the model fitted at the last ``C`` and keep
            its non-zero weights.  When true, scan from the largest ``C``
            downwards and take the first model without any strictly positive
            weight, keeping its strictly negative weights.

    Returns:
        ``(weights, intercept, mask)`` where ``mask`` is the boolean selector
        over the input features and ``weights`` are the selected entries.

    Raises:
        ValueError: if ``interp`` is true and every model on the path has at
            least one positive weight.
    """
    clf = LogisticRegression(penalty='l1', solver='saga', warm_start=True,
                             intercept_scaling=100000)
    cs = l1_min_c(x_train, y, loss='log', fit_intercept=True) * np.logspace(
        0, l1_exp_scale, l1_grid_size)
    cs = cs[cs <= max_penalty]

    # add final penalty; the emptiness check covers the case where every grid
    # value exceeds max_penalty (indexing cs[-1] would raise IndexError)
    if len(cs) == 0 or cs[-1] < max_penalty:
        cs = list(cs)
        cs.append(max_penalty)

    # fit path
    weights, intercepts = [], []
    for c in cs:
        clf.set_params(C=c)
        clf.fit(x_train, y)
        # copy: with warm_start the estimator is refitted on the next step
        weights.append(clf.coef_[0].copy())
        intercepts.append(clf.intercept_[0])

    if not interp:
        w, i = weights[-1], intercepts[-1]
        nonzero = w != 0
        return w[nonzero], i, nonzero

    for w, i in zip(reversed(weights), reversed(intercepts)):
        if (w > 0).any():
            continue
        neg = w < 0
        return w[neg], i, neg

    raise ValueError('No negative weights grid')
def clfProcessor(name, pathForData, scoring_function):
    """
    Loads the data, scales the data, defines a grid for the hyperparameters
    for a l1-regularized logistic regression classifier, performs L1-based
    feature selection, and finds the best hyperparameters via cross
    validation.

    :param name: Name of data extraction procedure, e.g. freq_2, abs, etc.
    :param pathForData: Path of the directory where the data is
    :param scoring_function: Scoring function to optimize, e.g. f1,
        precision, etc.
    :return clf: Unfitted classifier configured with the best parameters
    :return X: Scaled, shuffled and feature-selected dataset
    :return y: Labels (shuffled in the same order as X, cast to float)
    :return X_max: Array with maximal values of X before scaling, to
        reproduce the transform of the training set
    :return X_min: Array with minimal values of X before scaling, to
        reproduce the transform of the training set
    """
    # Load training data.
    # NOTE(review): the original comment said "each column is an
    # observation", but the code below treats rows as samples
    # (``X.shape[0]`` is ``n_samples``) -- confirm against getXandY.
    fileMeasTrControl, fileMeasTrCA = getFileNames(name, pathForData)
    X, y = getXandY(fileMeasTrControl, fileMeasTrCA)

    # Scaling to [0,1] intervals and saving transform to apply in test data
    X_max = X.max(axis=0)
    X_min = X.min(axis=0)
    min_max_scaler = preprocessing.MinMaxScaler()
    X = min_max_scaler.fit_transform(X)

    # Shuffle data
    n_samples, n2 = X.shape
    order = np.random.permutation(n_samples)
    X = X[order, :]
    # ``np.float`` was only an alias of the builtin ``float`` and has been
    # removed from NumPy; the builtin is the exact equivalent.
    y = y[order].astype(float)

    # L1 based feature selection
    # NOTE(review): ``fit_transform`` on the estimator and the
    # ``KFold(n, n_folds=...)`` call below look like older scikit-learn
    # APIs -- confirm the pinned library version before upgrading.
    theTransform = linear_model.LogisticRegression(C=1, penalty='l1', dual=False,
                                                   class_weight={1: 2})  # LinearSVC
    X = theTransform.fit_transform(X, y)

    # Find minimum C for non-empty model and get grid for cross validation.
    # Only the upper 20 of the 30 log-spaced multipliers are kept.
    cs_log = np.logspace(0, 4, num=30)
    l1_min = l1_min_c(X, y, loss='log')
    cs = l1_min * cs_log[10:]
    grid = {'C': cs, 'class_weight': [{1: 1}, {1: 2}, {1: 3}, {1: 5}]}

    # Perform grid search cross validation
    kf = KFold(len(y), n_folds=5)
    bestParameters = myCrossValidation(X, y, kf, scoring_function, grid)
    clf = linear_model.LogisticRegression(penalty='l1', **bestParameters)

    return [clf, X, y, X_max, X_min]
def refit_reg(X: np.ndarray, y: np.ndarray, l1_base_step: int,
              l1_exp_step: float, max_penalty: float,
              interp: bool = True) -> Tuple[np.ndarray, float, np.ndarray]:
    """Refit the final model along an L1 regularization path.

    Fits a warm-started L1 logistic regression along an increasing grid of
    ``C`` that always ends at ``max_penalty`` and picks one model from it.

    Args:
        X: training features.
        y: training target.
        l1_base_step: number of points in the initial ``C`` grid.
        l1_exp_step: upper exponent; the grid is
            ``l1_min_c(...) * np.logspace(0, l1_exp_step, l1_base_step)``.
        max_penalty: largest ``C`` allowed; larger grid values are dropped
            and ``max_penalty`` itself is appended when not already last.
        interp: when false, take the model fitted at the last ``C`` and keep
            its non-zero weights.  When true, scan from the largest ``C``
            downwards and take the first model without any strictly positive
            weight, keeping its strictly negative weights.

    Returns:
        ``(weights, intercept, mask)`` where ``mask`` is the boolean selector
        over the input features and ``weights`` are the selected entries.

    Raises:
        ValueError: if ``interp`` is true and every model on the path has at
            least one positive weight.
    """
    clf = LogisticRegression(penalty='l1', solver='saga', warm_start=True,
                             intercept_scaling=100000)
    cs = l1_min_c(X, y, loss='log', fit_intercept=True) * np.logspace(
        0, l1_exp_step, l1_base_step)
    cs = cs[cs <= max_penalty]

    # add final penalty; the emptiness check covers the case where every grid
    # value exceeds max_penalty (indexing cs[-1] would raise IndexError)
    if len(cs) == 0 or cs[-1] < max_penalty:
        cs = list(cs)
        cs.append(max_penalty)

    # fit path
    weights, intercepts = [], []
    for c in cs:
        clf.set_params(C=c)
        clf.fit(X, y)
        # copy: with warm_start the estimator is refitted on the next step
        weights.append(clf.coef_[0].copy())
        intercepts.append(clf.intercept_[0])

    if not interp:
        w, i = weights[-1], intercepts[-1]
        nonzero = w != 0
        return w[nonzero], i, nonzero

    for w, i in zip(reversed(weights), reversed(intercepts)):
        if (w > 0).any():
            continue
        neg = w < 0
        return w[neg], i, neg

    # A fallback that returned whatever was fitted last was considered here
    # and deliberately left disabled; failing loudly is the chosen behaviour.
    # return w[neg], i, neg
    raise ValueError('No negative weights grid')
def logistic_regression(model_df, response, folds):
    """Grid-search an L1 logistic regression and return the best estimator.

    ``response`` is first restricted to the labels it shares with
    ``model_df.index``.  ``C`` is searched over 50 evenly spaced values from
    ``l1_min_c(model_df, response, loss='log')`` to 5000 times that value,
    scoring both ``neg_log_loss`` and ``average_precision`` and refitting on
    the latter.  Diagnostics are written to the ``'log'`` logger.

    Args:
        model_df: feature frame; its columns are reported under ``app``.
        response: binary target indexed like ``model_df``.
        folds: passed to ``GridSearchCV`` as ``cv``.

    Returns:
        ``clf.best_estimator_`` of the fitted grid search.
    """
    logger = logging.getLogger('log')
    logger.info('dataset shape: {}'.format(model_df.shape))

    response = response[model_df.index.intersection(response.index)]

    min_c = l1_min_c(model_df, response, loss='log')
    # The grid used to be written as np.log10(np.logspace(a, b, 50)), which
    # is the linear grid a..b computed via 10**x and therefore overflows to
    # inf once b exceeds ~308.  np.linspace yields the same values directly.
    # NOTE(review): a log-spaced grid may have been intended -- confirm.
    tuned_parameters = {'C': np.linspace(min_c, min_c * 5000, 50)}

    # The solver is named explicitly: the L1 penalty is not supported by
    # every solver and the library default has changed between releases.
    clf = GridSearchCV(LogisticRegression(penalty='l1', solver='liblinear',
                                          random_state=100),
                       tuned_parameters,
                       cv=folds,
                       scoring=('neg_log_loss', 'average_precision'),
                       return_train_score=True,
                       refit='average_precision')
    clf.fit(model_df, response)

    logger.info('CV average precision: {}'.format(clf.best_score_))
    logger.info('best param: {}'.format(clf.best_params_))
    # make sure that best index isn't on the edges of the grid
    logger.debug('best param index: {}'.format(clf.best_index_))
    logger.debug('mean train score:\n{}'.format(
        clf.cv_results_['mean_train_average_precision']))
    logger.debug('mean test score:\n{}'.format(
        clf.cv_results_['mean_test_average_precision']))

    best = clf.best_estimator_
    coefs = pd.DataFrame(list(zip(model_df.columns, best.coef_[0])),
                         columns=['app', 'coef'])
    nonzero = coefs['coef'] != 0
    logger.info('train features after regularization: {}'.format(
        nonzero.sum()))
    logger.debug('coefficients:\n{}'.format(
        coefs[nonzero].sort_values(
            'coef', ascending=False).to_string(index=False)))
    logger.debug('intercept: {}'.format(best.intercept_[0]))
    logger.info('train average precision: {}'.format(
        average_precision_score(
            response, best.predict_proba(model_df)[:, 1])))

    return best
def fit(self, X, y):
    """Fit an L1 logistic path from weak to strong regularization.

    ``self.K`` values of ``C`` are tried in decreasing order: ``1e6`` first,
    then ``l1_min_c(X, y, loss='log')`` times ``np.logspace(3, 0, K - 1)``.
    The first model that is selected -- any model when ``self.max_var <= 0``,
    otherwise the first whose number of features with an absolute
    coefficient above ``1e-4`` (in any class row) is at most
    ``self.max_var`` -- is stored in ``self.coef_`` / ``self.intercept_`` and
    its position recorded in ``self.current_index``.  Selection is skipped
    when ``self.coef_`` is already set on entry.

    ``self.coef_path_``, ``self.intercepts_`` and ``self.alphas`` (``1 / C``)
    hold the whole path, stored in the reverse of the fitting order.

    Returns:
        ``self``.
    """
    steps_left = self.K
    scaled_tail = l1_min_c(X, y, loss='log') * np.logspace(3, 0, num=self.K - 1)
    cs = np.concatenate([[1e6], scaled_tail])

    clf = LogisticRegression(C=1.0, penalty='l1', tol=1e-6,
                             solver='liblinear', multi_class='ovr')

    coef_trace = []
    intercept_trace = []
    alpha_trace = []
    self.coef_path_ = coef_trace
    self.intercepts_ = intercept_trace
    self.alphas = alpha_trace

    for c in cs:
        steps_left -= 1
        clf.set_params(C=c)
        clf.fit(X, y)

        if self.classes_ is None:
            self.classes_ = clf.classes_

        coef = clf.coef_
        intercept = clf.intercept_

        if self.coef_ is None:
            # Count features that are active in at least one class row.
            if self.max_var <= 0 or np.sum(
                    np.sum(np.abs(coef) > 1e-4, axis=0) > 0) <= self.max_var:
                self.coef_ = coef
                self.intercept_ = intercept
                self.current_index = steps_left

        coef_trace.append(coef.copy())
        intercept_trace.append(intercept)
        alpha_trace.append(1.0 / c)

    if self.coef_ is None:
        # shouldn't happen but ya never know
        self.coef_ = clf.coef_
        self.intercept_ = clf.intercept_

    # Store the path in the reverse of the fitting order.
    self.coef_path_ = coef_trace[::-1]
    self.intercepts_ = intercept_trace[::-1]
    self.alphas = alpha_trace[::-1]
    return self
def pen_logi_reg(covariates, response, penalty='l1', xlabels=test_keywords):
    """Pick ``C`` by 5-fold CV for a penalised logistic regression and plot it.

    Sixteen values of ``C`` are tried: ``l1_min_c(covariates, response,
    loss='log')`` times ``np.logspace(0, 2, 16)``.  The value with the
    highest mean cross-validation score (the first one on ties) is used to
    refit on all data; the coefficients are printed and plotted.

    Args:
        covariates: feature matrix.
        response: target vector.
        penalty: penalty handed to ``LogisticRegression`` (``saga`` solver).
            It used to be ignored in favour of a hard-coded ``'l1'``; the
            default keeps that behaviour.
        xlabels: not used by the active code (only by a disabled
            ``plt.xticks`` call).

    Returns:
        ``(clf, cvAcc)``: the refitted classifier and the array of mean CV
        scores, one per candidate ``C``.
    """
    clf = LogisticRegression(penalty=penalty, solver='saga', tol=1e-6,
                             max_iter=int(1e6), warm_start=True,
                             fit_intercept=True)

    cs = l1_min_c(covariates, response, loss='log') * np.logspace(0, 2, 16)

    cvAcc = []
    for c in cs:
        clf.set_params(C=c)
        scores = cross_val_score(clf, covariates, response, cv=5)
        cvAcc.append(np.mean(scores))
    cvAcc = np.array(cvAcc)

    # Index of the best mean score; argmax returns the first maximum, which
    # matches the previous "where(score == max)[0][0]" lookup.
    best = int(np.argmax(cvAcc))
    clf.set_params(C=float(cs[best]))
    clf.fit(covariates, response)

    coefs_ = clf.coef_.ravel().copy()
    print('Model coefficients: ')
    print(coefs_)

    # plt.xticks(np.arange(len(coefs_)),xlabels)
    plt.plot(np.arange(len(coefs_)), coefs_)
    plt.title('Model coefficients')
    plt.xlabel('Keywords')
    plt.ylabel('Coefficients')

    # Positions of the coefficients that are not (numerically) zero.
    print(np.where(np.abs(coefs_) >= 1e-4))
    plt.show()

    return clf, cvAcc
def fit(self, L1=True, cs=None):
    """
    Use scikit-learn's LogisticRegression model to fit the data

    One model is fitted per column of S (``self.N`` of them) for every value
    in ``cs``; the value with the highest held-out log likelihood is kept and
    written to ``self.b`` and ``self.weights``.

    :param L1:  If True, use L1 penalty on the coefficients
    :param cs:  Candidate values of the inverse penalty C. When None, a grid
                is built here (see below).
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import l1_min_c

    # Stack the "F" and "S" arrays of every dataset in self.data_list.
    F = np.vstack([d["F"] for d in self.data_list])
    S = np.vstack([d["S"] for d in self.data_list])

    # Hold out some data for cross validation (the last 25% of the rows)
    offset = int(0.75 * S.shape[0])
    T_xv = S.shape[0] - offset
    F_xv = F[offset:, ...]
    S_xv = S[offset:, ...]
    augmented_xv_data = {"T": T_xv, "S": S_xv, "F": F_xv}

    F = F[:offset, ...]
    S = S[:offset, ...]

    # Get regularization path for inverse penalty C
    # NOTE: the L1 grid is derived from column 0 of S only and is then
    # reused for every column in the loop below.
    if cs is None:
        if L1:
            cs = l1_min_c(F, S[:,0], loss='log') * np.logspace(1, 6., 10)
        else:
            cs = np.logspace(-5,1,10)
    # cs = sigmas

    # The intercept is also subject to penalization, even though
    # we don't really want to penalize it. To counteract this effect,
    # we scale the intercept by a large value
    # NOTE(review): the value actually used is 1, i.e. no extra scaling --
    # confirm whether that is intentional.
    intercept_scaling = 1

    penalty = "l1" if L1 else "l2"

    for n_post in xrange(self.N):
        print "Computing regularization path for neuron %d ..." % n_post
        ints = []
        coeffs = []
        xv_scores = []
        lr = LogisticRegression(C=1.0, penalty=penalty, fit_intercept=True, intercept_scaling=intercept_scaling, tol=1e-6)
        for c in cs:
            print "Fitting for C=%.5f" % c
            lr.set_params(C=c)
            lr.fit(F, S[:,n_post])
            ints.append(lr.intercept_.copy())
            coeffs.append(lr.coef_.ravel().copy())
            # xv_scores.append(lr.score(F_xv, S_xv[:,n_post]).copy())

            # Temporarily set the weights and bias
            # (heldout_log_likelihood is called on self right after, so the
            # candidate has to be installed first)
            self.b[n_post] = lr.intercept_
            self.weights[n_post, :] = lr.coef_
            xv_scores.append(self.heldout_log_likelihood(augmented_data=augmented_xv_data))

        # Choose the regularization penalty with cross validation
        print "XV Scores: "
        for c,score in zip(cs, xv_scores):
            print "\tc: %.5f\tscore: %.1f" % (c,score)
        best = np.argmax(xv_scores)
        print "Best c: ", cs[best]

        # Save the best weights (overwrites the temporary values set above)
        self.b[n_post] = ints[best]
        self.weights[n_post, :] = coeffs[best]
        print " Max w: ", self.weights[n_post].max(), \
              " Min w: ", self.weights[n_post].min()
        # Sanity check: the chosen model must not be all zeros.
        assert abs(self.weights[n_post]).max() > 1e-6
        print ""
import load_data_ext as ld_ext
import load_data_mi_ext_new as ldmi_ext

np.random.seed(10)

############################################################################
# Quick logistic regression with lasso penalty, chosen with cross validation
# website used for much code:

# Initial model with an arbitrary C
mod1 = LogisticRegression(C=0.5, penalty='l1')

# Smallest value of C before all coefficients are set to zero
# NOTE(review): no `loss=` is passed to l1_min_c here although the model is
# a logistic regression -- confirm the library default is the intended one.
# NOTE(review): `.ix` mixes label- and position-based lookup; whether column
# 229 is included in the first slice depends on the column labels -- confirm
# before porting to `.loc` / `.iloc`.
min_l1_C = l1_min_c(ld_ext.train1.ix[:, 0:229], ld_ext.train1.ix[:, 229])
# Bare expression: has no effect when run as a script (displays the value in
# an interactive session only).
'%f' % min_l1_C
# 0.000028 ~= 0.00003

# Create candidate values of C: 15 log-spaced multiples of the minimum
c_vals = min_l1_C * np.logspace(0, 4, 15)

# Create a dictionary whose keys are the candidate values of C.
# The dictionary will hold the error rates in each CV trial for that
# value of C.
cdict = {}
for c in c_vals:
    cdict[c] = []

# Cross validation to choose c. train1 and test1 already have randomized rows from train_test_split
# Generate indices to split data into 50 chunks
def fit(self,xw,xwl2,y,gs=4,model_type='logit',verbose=True):
    """Two-stage fit; results are stored in self.clf1 and self.clf2.

    Stage 1 grid-searches C for a classifier on (xw, y) and keeps the best
    estimator as self.clf1. Stage 2 obtains new labels ``hm_y`` from
    ``self.suffle_hm`` and grid-searches C for an L1 logistic regression on
    (xwl2, hm_y), kept as self.clf2.

    xw         -- stage-1 features
    xwl2       -- stage-2 features
    y          -- stage-1 labels
    gs         -- number of stratified folds used by both grid searches
    model_type -- 'logit' selects LogisticRegression for stage 1, anything
                  else a linear SVC
    verbose    -- stored on self; when true the chosen estimators and their
                  coefficients are printed
    """
    self.verbose = verbose
    if model_type=='logit':
        clf = LogisticRegression(C=1,class_weight='balanced',penalty='l2',max_iter=300)
    else:
        #clf = SVC(kernel='linear', class_weight='balanced', C=.1,probability=False)
        clf = SVC(C=1.,cache_size=500,kernel='linear',class_weight='balanced',probability=False)
    # The triple-quoted block below is a disabled experiment kept as a bare
    # string; it is never executed.
    '''
    # wrapper feature selection
    rfecv = RFECV(estimator=clf, step=1, cv=StratifiedKFold(y, 3), scoring='f1')#accuracy
    rfecv.fit(xw, y)
    print("Optimal number of features : %d" % rfecv.n_features_)
    print("ids: {}".format((rfecv.ranking_<=5).sum()))
    print rfecv.grid_scores_
    self.rfecv = rfecv
    if rfecv.support_.sum()>10:
        self.w_select = rfecv.support_
    else:
        self.w_select = rfecv.ranking_<=10
    '''
    #xw = [:,self.w_select]
    #self.mask_selection = (np.ones((1,xw.shape[1]))==1)[0,:]

    ## Optimize the hyper parameters
    # Stage 1
    #param_grid = dict(C=(np.array([5,3,1])))
    if model_type=='logit':
        param_grid = dict(C=(10**np.arange(1.,-2.,-0.5)))
        #param_grid = dict(C=(np.logspace(-.2, 1., 15)))
        #param_grid = dict(C=(np.arange(3,1,-0.5)))
    else:
        param_grid = dict(C=(np.arange(3.5,0.,-0.5)))
        # NOTE(review): this second assignment immediately replaces the grid
        # above. The original indentation was lost, so whether it belongs to
        # this branch or applies to both model types must be confirmed.
        param_grid = dict(C=(1.,1.00001))
        #param_grid = dict(C=(np.logspace(-1.5, 0, 10)))
        #param_grid = dict(C=(np.arange(2.,0.5,-0.05)))
        #param_grid = dict(C=(np.array([0.01, 0.1, 1, 10, 100, 1000])))
    gridclf = GridSearchCV(clf, param_grid=param_grid, cv=StratifiedKFold(y,n_folds=gs), n_jobs=-1,scoring='accuracy')
    gridclf.fit(xw,y)
    self.clf1 = gridclf.best_estimator_
    if self.verbose:
        print self.clf1
        print self.clf1.coef_
    #hm_y,y_pred_train = self.estimate_hitmiss(xw,y)
    # Stage-2 labels; ``proba`` is only referenced by disabled code below.
    hm_y,proba = self.suffle_hm(xw,y,gamma=.9,n_iter=100)
    print 'Stage 2'
    #Stage 2
    # min_c is printed for information only; the active grid below does not
    # depend on it.
    min_c = l1_min_c(xwl2,hm_y,loss='log')
    print 'minimum c: ',min_c
    #clf2 = LogisticRegression(C=10**0.1,class_weight=None,penalty='l2',solver='sag')
    #clf2 = LogisticRegression(C=1,class_weight=None,penalty='l2',solver='sag',max_iter=300)
    #clf2 = LinearSVC(class_weight='balanced',penalty='l1',dual=False)
    clf2 = LogisticRegression(C=1.,class_weight='balanced',penalty='l1',solver='liblinear',max_iter=300)
    #clf2 = LogisticRegression(C=1.,class_weight='balanced',penalty='l2',solver='sag',max_iter=300)
    #clf2 = SVC(C=1.,cache_size=500,kernel='linear',class_weight='balanced')
    #param_grid = dict(C=(10**np.arange(1.,-2.,-0.5)))
    #param_grid = dict(C=(np.arange(3,1,-0.5)))
    #param_grid = dict(C=(np.logspace(-0.5, 2., 30)))
    #param_grid = dict(C=(np.logspace(1., 1.6, 30)))
    param_grid = dict(C=(np.logspace(-.2, 1., 15)))
    #param_grid = dict(C=(np.logspace(np.log10(min_c), 0., 15)))
    #param_grid = dict(C=(1,1.0001))
    # 2 levels balancing (disabled experiment kept as a bare string)
    '''
    new_classes = np.zeros_like(y)
    new_classes[(y==0) & (hm_y==0)]=0
    new_classes[(y==1) & (hm_y==0)]=1
    new_classes[(y==0) & (hm_y==1)]=2
    new_classes[(y==1) & (hm_y==1)]=3
    tmp_samp_w = len(new_classes) / (len(np.unique(new_classes))*1. * np.bincount(new_classes))
    tmp_samp_w = (1.*(tmp_samp_w/tmp_samp_w.sum()))
    sample_w = new_classes.copy().astype(float)
    sample_w[new_classes==0] = tmp_samp_w[0]
    sample_w[new_classes==1] = tmp_samp_w[1]
    sample_w[new_classes==2] = tmp_samp_w[2]
    sample_w[new_classes==3] = tmp_samp_w[3]
    '''
    #gridclf = GridSearchCV(clf2, param_grid=param_grid, cv=StratifiedKFold(hm_y,n_folds=gs),fit_params=dict(sample_weight=sample_w), n_jobs=-1,scoring='accuracy')
    #gridclf = GridSearchCV(clf2, param_grid=param_grid, cv=StratifiedKFold(hm_y,n_folds=gs),fit_params=dict(sample_weight=proba), n_jobs=-1,scoring='accuracy')
    gridclf = GridSearchCV(clf2, param_grid=param_grid, cv=StratifiedKFold(hm_y,n_folds=gs), n_jobs=-1,scoring='accuracy')
    gridclf.fit(xwl2,hm_y)
    clf2 = gridclf.best_estimator_
    #clf2.fit(xw[train_index,:][:,idx_sz],hm_y)
    if self.verbose:
        print clf2
        print clf2.coef_
    self.clf2 = clf2
def train_and_val(features_train,labels_train,features_val,labels_val, ml_engine, feature_group = None):
    '''
    Train one family of models and pick the best one on the validation set.

    Parameters
    ----------
    features_train : numpy 2d array. training feature matrix
    labels_train : numpy 1d array. training labels. 0 for negative and 1 for positive
    features_val : numpy 2d array. validation feature matrix
    labels_val : numpy 1d array. validation labels. 0 for negative and 1 for positive
    ml_engine: string. machine learning engine to use. Possible choice :
                1) 'lrlasso': logistic lasso
                2) 'lglasso': logistic group lasso
                3) 'lgen': logistic elastic net
                4) 'grrf' : guided regularized random forest
                5) 'lsvm': linear support vector machine
                6) 'lgpcc': logistic regression + pearson correlation coefficient
    feature_group : group assignment of the features; only read by the
                'lglasso' engine.

    Returns
    -------
    best_model : the best model selected from validation
    best_param : the best hyper-parameters from validation
    feature_num : number of selected features from best model

    Notes
    -----
    * For 'lrlasso' / 'lglasso' the zeros in ``labels_train`` and
      ``labels_val`` are replaced by -1 in place, i.e. the caller's arrays
      are modified.
    * "Accuracy" below is the count of matching predictions, not a rate.
    * In the search loops ``best_acc`` starts at 0 and is only replaced by a
      strictly larger count; if no candidate gets a single validation sample
      right, ``best_model`` stays ``None`` and the feature count that follows
      fails.
    * An ``ml_engine`` outside the list above runs no branch, so the final
      ``return`` fails on unassigned names.
    '''
    # Train model and get predicted score
    if ml_engine == "lrlasso" or ml_engine == "lglasso":
        # Convert to R objects
        features_train = BASE.as_matrix(features_train)
        features_val = BASE.as_matrix(features_val)
        # gglasso requires negative class to have label '-1'. Replace 0 with -1
        # For convenience of comparison, labels_val will not be converted to R object
        labels_train[labels_train == 0] = -1
        labels_val[labels_val == 0] = -1
        labels_train = BASE.as_vector(labels_train)
        # Column vector, so it can be compared against every column of `pred`.
        labels_val = labels_val.reshape(labels_val.shape[0],1)
        # To avoid name conflict with python keyword 'lambda', use dictionary to pass function argument
        if ml_engine == "lrlasso":
            args = {'x':features_train, 'y':labels_train, 'loss':'logit', 'lambda.factor':0.01}
        elif ml_engine == "lglasso":
            args = {'x':features_train, 'y':labels_train, 'group':BASE.as_vector(feature_group), 'loss':'logit', 'lambda.factor':0.01}
        # Train model on training data set
        best_model = GGL.gglasso(**args)
        # Predict on validation data set
        pred = GGL.predict_gglasso(best_model,type = 'class',newx = features_val)
        pred = np.array(pred)
        # Get sequence of lambdas
        lambda_seq = np.array(best_model[best_model.names.index('lambda')])
        # Get the lambda which gives highest accuracy on validation dataset
        best_idx = np.argmax(np.sum(pred == labels_val,axis = 0))
        best_param = lambda_seq[best_idx]
        # Get number of selected features from the best model
        coef = np.array(best_model[best_model.names.index("beta")])[:,best_idx]
        feature_num = coef[coef != 0].shape[0]
    elif ml_engine == "lgen":
        # Generate sequence of two parameters: alpha and l1_ratio
        alpha_list = np.logspace(-3,3,5)
        l1_ratio_list = np.linspace(0,1,5)
        best_acc = 0
        best_param = None
        best_model = None
        # Test the different alpha and l1_ratio on validation dataset
        for alpha in alpha_list:
            for l1_ratio in l1_ratio_list:
                lgen = SGDClassifier(loss = 'log',penalty = 'elasticnet', alpha = alpha, l1_ratio = l1_ratio).fit(features_train, labels_train)
                pred = lgen.predict(features_val)
                acc = np.sum(labels_val == pred)
                if acc > best_acc:
                    best_acc = acc
                    best_param = (alpha,l1_ratio)
                    best_model = lgen
        # Get number of selected features from the best model
        feature_num = best_model.coef_[best_model.coef_ != 0].shape[0]
    elif ml_engine == "grrf":
        # Convert to R objects
        features_train = BASE.as_matrix(features_train)
        features_val = BASE.as_matrix(features_val)
        labels_train = BASE.as_vector(labels_train)
        rf = RRF.RRF(features_train,BASE.as_factor(labels_train), flagReg = 0, ntree = 100) # build an ordinary RF
        # Get importance score, normalised by its maximum
        imp = rf[rf.names.index("importance")]
        imp = np.array(imp)
        imp = imp/(np.max(imp))
        best_acc = 0
        best_param = None
        best_model = None
        # Test different gamma on validation dataset
        for gamma in (0,0.5,1):
            coefReg = (1-gamma) + gamma*imp
            coefReg = FloatVector(coefReg)
            grrf = RRF.RRF(features_train,BASE.as_factor(labels_train), flagReg=1, coefReg=coefReg, ntree = 100)
            # presumably shifts R's 1-based factor codes back to 0/1 labels -- TODO confirm
            pred = np.array(RRF.predict_RRF(grrf, features_val)) - 1
            acc = np.sum(labels_val == pred)
            if acc > best_acc:
                best_acc = acc
                best_param = gamma
                best_model = grrf
        # Get number of selected features from the best model
        feature_num = np.array(best_model[best_model.names.index("feaSet")]).shape[0]
    elif ml_engine == "lsvm":
        '''
        Calculate the lower bound of C for a null model
        If C goes smaller than this value, the model would end up selecting no features
        '''
        min_c = l1_min_c(features_train, labels_train)
        # log spaced list of C parameters, from min_c up to 10**3
        c_list = np.logspace(np.log10(min_c),3,10)
        best_acc = 0
        best_param = None
        best_model = None
        # Train on training dataset, validate each C on validation dataset
        for C in c_list:
            svm = LinearSVC(C = C,penalty = 'l1',dual=False).fit(features_train,labels_train)
            pred = svm.predict(features_val)
            acc = np.sum(labels_val == pred)
            if acc > best_acc:
                best_acc = acc
                best_param = C
                best_model = svm
        # Get number of selected features from the best model
        feature_num = best_model.coef_[best_model.coef_ != 0].shape[0]
    elif ml_engine == "lgpcc":
        # Pearson correlation of every feature column with the labels
        cor_coef = []
        for i in range(features_train.shape[1]):
            x = features_train[:,i].astype(np.float32)
            y = labels_train.astype(np.float32)
            cor_coef.append(pearsonr(x,y)[0])
        cor_coef = np.array(cor_coef)
        # Feature indices by decreasing |correlation|.
        # NOTE(review): the slice [-1:0:-1] stops before position 0, so the
        # feature with the smallest |correlation| is never part of `order` --
        # confirm this is intended.
        order = np.argsort(np.abs(cor_coef))[-1:0:-1]
        # Candidate feature counts: 50, 100, ... (strictly below len(order))
        to_include = np.arange(50,order.shape[0],50)
        best_acc = 0
        best_param = None
        best_model = None
        for i in to_include:
            lg = LogisticRegression().fit(features_train[:,order[:i]],labels_train)
            pred = lg.predict(features_val[:,order[:i]])
            acc = np.sum(pred == labels_val)
            if acc > best_acc:
                best_acc = acc
                best_param = order[:i]
                best_model = lg
        # Get number of selected features from the best model
        feature_num = best_model.coef_[best_model.coef_ != 0].shape[0]
    return(best_model,best_param,feature_num)
X = np.array([np.array(xi) for xi in X]) # Add subject number as feature #X = np.hstack([X, trials['subject'].reshape(-1,1)]) y = trials['condition'] == 'win_event' y = y.astype(int) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # Create model model = LogisticRegression(random_state=0, penalty='l1', solver='liblinear', tol=1e-6, max_iter=int(1e6), warm_start=True, intercept_scaling=10000.) # Regularization parameter cs = l1_min_c(X_train, y_train, loss='log') * np.logspace(0, 3, 16) coefs_ = [] accs = [] # Iterate over smaller subset for parameter search to reduce computational time X_train_subset = X_train[:5000] Y_train_subset = y_train[:5000] for c in cs: model.set_params(C=c) model.fit(X_train_subset, Y_train_subset) coefs_.append(model.coef_.ravel().copy()) accs.append(model.score(X_test, y_test)) # Plot coefs coefs_ = np.array(coefs_) plt.plot(np.log10(cs), coefs_, marker='o') ymin, ymax = plt.ylim()
f1_tests = list() acc_trains = list() acc_tests = list() for i in range(10): # Sample and split data sampls = get_group_samples(scenario, feat_keys, n_labl, sample_size, undersampling=True) feat_train = np.vstack([s[0] for s in sampls[1:]]) labl_train = np.hstack([s[1] for s in sampls[1:]]) # feat_train = np.vstack([s[0] for s in [sampls[0]] + sampls[2:]]) # labl_train = np.hstack([s[1] for s in [sampls[0]] + sampls[2:]]) c = (l1_min_c(feat_train, labl_train, loss='log') * np.logspace(0, 4, 5)).tolist()[1] # if sample_size > 100: # Fix for PH-Breuer # c = 0.02761796 # Learn identifier idf = logReg( random_state=False, fit_intercept=False, class_weight='none', max_iter=max_iter, penalty=penalty, solver=solver, C=c, l1_ratio=l1_ratio, # 0.0=l2, 1.0=l1 verbose=False, n_jobs=-1,
def get_C_grid(X, y):
    """Return 100 candidate ``C`` values for an L1 logistic regression.

    The values are ``l1_min_c(X, y, loss='log')`` multiplied by
    ``np.logspace(0, 3, 100)``, i.e. they span three decades upwards from
    that lower bound.
    """
    lower_bound = l1_min_c(X, y, loss='log')
    multipliers = np.logspace(0, 3, 100)
    return lower_bound * multipliers
from sklearn import datasets
from sklearn.svm import l1_min_c

# Keep only the iris samples whose target is not 2 (binary problem).
iris = datasets.load_iris()
X = iris.data
y = iris.target

X = X[y != 2]
y = y[y != 2]

X /= X.max()  # Normalize X to speed-up convergence

# #############################################################################
# Demo path functions

# 16 values of C spanning seven decades upwards from the l1_min_c bound
cs = l1_min_c(X, y, loss="log") * np.logspace(0, 7, 16)

print("Computing regularization path ...")
start = time()
# warm_start=True: each fit starts from the previous solution
clf = linear_model.LogisticRegression(
    penalty="l1",
    solver="liblinear",
    tol=1e-6,
    max_iter=int(1e6),
    warm_start=True,
    intercept_scaling=10000.0,
)
coefs_ = []
for c in cs:
    clf.set_params(C=c)
    clf.fit(X, y)
# Switch: True recomputes the tsfresh features from `all_audio`; False
# reloads the pickles written at the end of a previous run.
regenerate_tsfresh = True
if regenerate_tsfresh:
    print('Generating tsfresh data...')
    settings = EfficientFCParameters()
    audio_tsfresh = extract_relevant_features(all_audio, all_labels, column_id='file_id', column_sort='time_id', default_fc_parameters=settings)
else:
    print('Reading tsfresh data...')
    all_labels = pd.read_pickle('pkl/drum_tsfresh_labels.pkl')
    audio_tsfresh = pd.read_pickle('pkl/drum_tsfresh.pkl')

print('Running logistic regression CV...')
print('Started CV %s' % datetime.now())
# 16 values of C spanning seven decades upwards from the l1_min_c bound
cs = l1_min_c(audio_tsfresh, all_labels, loss='log') * np.logspace(0, 7, 16)
cv_result = LogisticRegressionCV(Cs=cs, penalty='l1', multi_class='ovr', solver='saga', tol=1e-6, max_iter=int(1e6), n_jobs=-1).fit(audio_tsfresh, all_labels)
print('Done CV %s' % datetime.now())

print('Dumping results...')
# Make sure the output directory exists before writing into it.
Path("pkl").mkdir(exist_ok=True)
# NOTE: labels and features are written even when they were just read from
# these same files (regenerate_tsfresh = False).
all_labels.to_pickle('pkl/drum_tsfresh_labels.pkl')
audio_tsfresh.to_pickle('pkl/drum_tsfresh.pkl')
dump(cv_result, 'pkl/drum_logreg_cv.joblib')
gc.collect() # --------------------------------------------------------------------- # Grid Search # Scaling ss = StandardScaler() mm = MinMaxScaler() ss.fit(pd.concat([train_x, test_x], axis=0)) train_x_s = ss.transform(train_x) test_x_s = ss.transform(test_x) mm.fit(pd.concat([train_x, test_x], axis=0)) train_x_m = mm.transform(train_x) test_x_m = mm.transform(test_x) cs = l1_min_c(X, y, loss='log') # lower limit of 'c' in L1 regression param_grid = {'penalty': ['l1'], 'C': [0.1, 0.2]} grid_cv_logit = GridSearchCV( LogisticRegression( solver='saga', # L1 : sag, L2 :saga, both algo need Scaling random_state=SEED, n_jobs=1), param_grid=param_grid, scoring='roc_auc', n_jobs=CPU, cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED), verbose=1, ) grid_cv_logit.fit(train_x_s, train_y.values.reshape(-1, )) # grid_cv_logit.fit(train_x_m, train_y.values.reshape(-1, ))
from sklearn import datasets
from sklearn.svm import l1_min_c

# Keep only the iris samples whose target is not 2 (binary problem).
iris = datasets.load_iris()
X = iris.data
y = iris.target

X = X[y != 2]
y = y[y != 2]

# Centre the features.
X -= np.mean(X, 0)

###############################################################################
# Demo path function

# Default 50-point grid spanning three decades upwards from the l1_min_c bound
cs = l1_min_c(X, y, loss="log") * np.logspace(0, 3)

print("Computing regularization path ...")
start = datetime.now()
# The solver is named explicitly: the L1 penalty is not supported by every
# solver and the library default has changed between releases.
clf = linear_model.LogisticRegression(C=1.0, penalty="l1", solver="liblinear", tol=1e-6)
coefs_ = []
for c in cs:
    clf.set_params(C=c)
    clf.fit(X, y)
    coefs_.append(clf.coef_.ravel().copy())
print("This took %s" % (datetime.now() - start))

coefs_ = np.array(coefs_)
plt.plot(np.log10(cs), coefs_)
ymin, ymax = plt.ylim()
plt.xlabel("log(C)")
def trainModel(self, do_pca=False, out_dir='./cache', rftop=40,
               class_labels=SP.array(['G1', 'S', 'G2M']), cv=10, npc=3,
               is_SVM=1, is_RFE=0, scale=False):
    """Cross-validate several classifiers on ``self.Y`` / ``self.labels``.

    For every outer cross-validation fold, plus one final pass trained on
    all samples, this fits:

    * a Gaussian naive Bayes model on PCA scores (only if ``do_pca >= 1``),
    * an SVM, RFE-based if ``is_RFE == 1``, otherwise with univariate
      feature selection if ``is_SVM == 1``,
    * an L1 logistic regression whose ``C`` is grid-searched on a path
      starting at ``l1_min_c``,
    * an L2 logistic regression with ``C=1e5``,
    * an extra-trees forest.

    Out-of-fold class probabilities are collected per model.  The final
    pass is evaluated on ``self.Y_tst`` / ``self.labels_tst`` when
    ``self.Y_tst`` is not ``None`` and on the training samples otherwise.

    Parameters
    ----------
    do_pca : bool or int
        Falsy: no PCA.  ``>= 1``: fit a PCA with ``npc`` components and
        train the naive Bayes model on it.  ``2``: also append the PCA
        scores to the feature matrix.  ``3``: selects the PCA-aware SVM
        grid (see the review note in the SVM branch).
    out_dir : str
        Output directory (created if missing) for plots and the report file.
    rftop : int
        Maximum number of top forest features plotted and stored in
        ``self.ranking``.
    class_labels : array
        Not used by this method.
    cv : int or 'LOOCV'
        Outer cross-validation scheme; also the inner scheme of the RFE
        branch.
    npc : int
        Number of PCA components.
    is_SVM, is_RFE : int
        Select the SVM variant (RFE takes precedence).
    scale : bool
        Not used by this method.

    Side effects
    ------------
    Sets ``self.cv``, ``self.scores``, ``self.scores_tst``,
    ``self.ranking``, ``self.scoresLR*`` and ``self.scoresLRall*``; sets
    ``self.scoresSVM*`` / ``self.scoresSVMrbf*`` when the univariate SVM
    branch ran and ``self.scoresGNB*`` when ``do_pca >= 1``.  Saves the
    feature-importance plot of the final pass, creates
    ``classification_reportCV.txt`` in ``out_dir`` and prints the
    classification reports.
    """
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    CFG = {
        'is_RFE': is_RFE,  # use recursive feature selection (can be slow for large datasets)
        'is_SVM': is_SVM,  # use SVM with univariate feature selection (faster than RFE)
        'CV_inner': cv,  # inner CV for RFE_CV: either an int or 'LOOCV'
        'out_dir': out_dir,
        'do_pca': do_pca,
        'lassotop': 20,
        # Per-fold ROC plots in the RFE branch.  The key used to be read
        # without ever being set (KeyError); 0 keeps per-fold plots off.
        'CV_plots': 0,
    }

    self.cv = cv
    Y = self.Y
    labels = self.labels
    var_names = self.geneNames
    numClasses = self.numClasses

    # Out-of-fold class-probability matrices, one row per training sample.
    pred_shape = (len(labels), numClasses)
    predRF = SP.zeros(pred_shape)
    predSVM = SP.zeros(pred_shape)
    predSVMrbf = SP.zeros(pred_shape)
    predGNB = SP.zeros(pred_shape)
    predLR = SP.zeros(pred_shape)
    predLRall = SP.zeros(pred_shape)
    names_dict = {}

    if self.cv == 'LOOCV':
        CV_list = list(iter(LeaveOneOut(len(labels))))
    else:
        CV_list = list(iter(StratifiedKFold(labels, n_folds=self.cv)))
    # The last entry trains and tests on all data: the "final model" pass.
    CV_list.append((SP.array(range(Y.shape[0])), SP.array(range(Y.shape[0]))))

    lambda_best = SP.zeros((1, len(CV_list))).ravel()
    last = len(CV_list) - 1
    # The naive Bayes model is only trained when PCA features exist.
    has_gnb = do_pca >= 1

    print("Performing cross validation ...")
    for i, (cv_tr, cv_tst) in enumerate(CV_list):
        is_final = (i == last)
        if not is_final:
            print("Fold " + str(i + 1) + " of " + str(last))
        else:
            print("Final model")

        # data of this CV run
        lab_tr = labels[cv_tr]
        Ytr = Y[cv_tr, :]
        Ytst = Y[cv_tst, :]
        lab_tst = labels[cv_tst]

        # string label for this fold
        if is_final:
            foldlabel = 'full'
            # `is None`, not `== None`: the latter compares an array
            # elementwise and cannot be used as a condition.
            if self.Y_tst is not None:
                foldlabel = 'Test'
                Ytst = self.Y_tst
                lab_tst = self.labels_tst
        else:
            foldlabel = str(i)

        if has_gnb:
            # PCA scores are the features of the naive Bayes model.
            pcaCC = PCA(n_components=npc, whiten=False)
            pcaCC.fit(Ytr)
            pcaTst = pcaCC.transform(Ytst)
            pcaTr = pcaCC.transform(Ytr)
            gnb = GaussianNB()
            y_pred = gnb.fit(pcaTr, lab_tr).predict_proba(pcaTst)
            if not is_final:
                predGNB[cv_tst, :] = y_pred
            else:
                predGNB_ts = y_pred
            if do_pca == 2:
                # Append the PCA scores as extra feature columns.
                Ytr = SP.concatenate((Ytr, pcaTr), 1)
                Ytst = SP.concatenate((Ytst, pcaTst), 1)
                pcnames = ['PC' + str(pci + 1) for pci in range(npc)]
                # Rebuild from the original names on every fold so the PC
                # names are not appended repeatedly; both arrays are 1-D,
                # hence the default axis.
                var_names = SP.concatenate((self.geneNames, SP.array(pcnames)))

        print(" Computing random forest ...")
        if CFG['is_RFE'] == 1:  # Recursive feature selection with SVM
            print(" Computing RFE with SVM ...")
            svc = SVC(kernel="linear", probability=False, class_weight='auto')  # linear SVM for selection
            rfecv = RFECV(estimator=svc, step=1, scoring='f1')
            param_grid = dict(estimator__C=[0.1, 1, 10, 100, 1000])
            clf_rfe = GridSearchCV(rfecv, param_grid=param_grid, cv=3, scoring='f1')  # find optimal parameters
            clf_rfe.fit(Ytr, lab_tr)
            # NOTE(review): the selector rebuilt below is never fitted or
            # used; the grid-search winner is used instead.  Confirm intent.
            svc = SVC(kernel="linear", probability=False,
                      C=clf_rfe.best_estimator_.estimator.C, class_weight='auto')
            if CFG['CV_inner'] == '':
                rfecv = RFECV(estimator=svc, step=1, scoring='f1')
            elif CFG['CV_inner'] == 'LOOCV':
                rfecv = RFECV(estimator=svc, step=1, scoring='f1',
                              cv=LeaveOneOut(len(lab_tr)))
            else:
                rfecv = RFECV(estimator=svc, step=1, scoring='f1',
                              cv=StratifiedKFold(lab_tr, n_folds=CFG['CV_inner']))
            clf_rfe.best_estimator_.fit(Ytr, lab_tr)
            # NOTE(review): predict() yields hard labels, while predSVM
            # holds one column per class - verify the shapes are compatible.
            predicted = clf_rfe.best_estimator_.predict(Ytst)
            if not is_final:
                predSVM[cv_tst, :] = predicted
            else:
                # Was an indexed write into an undefined name (NameError).
                predSVM_ts = predicted

            selected = clf_rfe.best_estimator_.ranking_ == 1
            classifier = svm.SVC(kernel='rbf', gamma=0.05, class_weight='auto',
                                 probability=True)  # rbf kernel for prediction
            param_grid = dict(C=[0.1, 1], gamma=[1e-1, 1e-2, 1e-3])
            clf_rbf = GridSearchCV(classifier, param_grid=param_grid, cv=3, scoring='f1')
            clf_rbf.fit(Ytr[:, selected], lab_tr)
            clf_rbf.best_estimator_.fit(Ytr[:, selected], lab_tr)
            predicted = clf_rbf.best_estimator_.predict_proba(Ytst[:, selected])
            if not is_final:
                predSVMrbf[cv_tst, :] = predicted
            else:
                predSVMrbf_ts = predicted
            fpr, tpr, _ = metrics.roc_curve(lab_tst, predicted[:, 1])
            if is_final or CFG['CV_plots'] > 0:
                PL.figure()
                PL.plot(fpr, tpr)
                PL.savefig(CFG['out_dir'] + '/RF_SVM_' + foldlabel + '.pdf')
            names_dict[foldlabel + '_SVM'] = self.geneNames[selected]
        elif CFG['is_SVM'] == 1:
            # univariate FS with rbf SVM; choose this if you have a large
            # data set (many features, eg RNAseq)
            print(" SVM feature selection ...")
            classifier = svm.SVC(kernel='rbf', gamma=0.05, class_weight='auto',
                                 probability=True)
            selection = SelectKBest(k=1)
            combined_features = FeatureUnion([("univ_select", selection)])
            X_features = combined_features.fit(Ytr, lab_tr).transform(Ytr)
            scaler = preprocessing.StandardScaler().fit(Ytr)
            YtrS = scaler.transform(Ytr)
            YtstS = scaler.transform(Ytst)
            classifier.fit(X_features, lab_tr)
            pipeline = Pipeline([("features", combined_features), ("svm", classifier)])

            # Candidate numbers of selected features: log2-spaced from 8 up
            # to the number of features.
            k_grid = SP.unique(SP.round_(SP.logspace(
                3.0, SP.log2(Ytr.shape[1]), num=min(10, Ytr.shape[1]), base=2.0)))
            if CFG['do_pca'] == 3:
                # NOTE(review): the feature union above has no "pca" step,
                # so this grid names a parameter the pipeline lacks - verify.
                param_grid = dict(
                    features__pca__n_components=SP.unique(SP.round_(SP.logspace(
                        1.0, max(SP.log2(Ytr.shape[1]), SP.log2(10)),
                        num=min(5, Ytr.shape[1]), base=2.0))),
                    features__univ_select__k=k_grid,
                    svm__C=[0.1, 1, 10],
                    svm__gamma=[1e-1, 1e-2, 1e-3])
            else:
                C_range = 10. ** SP.arange(0, 2)
                gamma_range = 10. ** SP.arange(-5, 1)
                param_grid = dict(features__univ_select__k=k_grid,
                                  svm__C=C_range, svm__gamma=gamma_range)
            clf = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='f1')
            clf.fit(YtrS, lab_tr)
            print("The best classifier is: ", clf.best_estimator_)
            predicted = clf.best_estimator_.predict_proba(YtstS)
            if not is_final:
                predSVM[cv_tst, :] = predicted
            else:
                predSVM_ts = predicted

            classifier = svm.SVC(kernel='rbf', gamma=0.05, class_weight='auto',
                                 probability=True)  # rbf kernel for prediction
            param_grid = dict(C=[1, 10], gamma=[1e-1, 1e-2, 1e-3])
            clf_rbf = GridSearchCV(classifier, param_grid=param_grid, cv=5, scoring='f1')
            clf_rbf.fit(Ytr, lab_tr)
            clf_rbf.best_estimator_.fit(Ytr, lab_tr)
            predicted = clf_rbf.best_estimator_.predict_proba(Ytst)
            if not is_final:
                predSVMrbf[cv_tst, :] = predicted
            else:
                predSVMrbf_ts = predicted

        # do lasso with regularisation path
        cs = l1_min_c(Ytr, lab_tr, loss='log') * SP.logspace(0, 3)
        print(" Computing regularization path ...")
        lasso = linear_model.LogisticRegression(C=cs[0] * 10.0, penalty='l1', tol=1e-6)
        param_grid = dict(C=cs)
        clf_lr = GridSearchCV(lasso, param_grid=param_grid, cv=5, scoring='f1')
        clf_lr.fit(Ytr, lab_tr)
        clf_lr.best_estimator_.fit(Ytr, lab_tr)
        lambda_best[i] = clf_lr.best_params_.get('C')
        predicted = clf_lr.best_estimator_.predict_proba(Ytst)
        if not is_final:
            predLR[cv_tst, :] = predicted
        else:
            predLR_ts = predicted

        # Coefficients along the whole path, one row per C value.
        clf = linear_model.LogisticRegression(C=cs[0] * 10.0, penalty='l1', tol=1e-6)
        coefs_ = []
        for c in cs:
            clf.set_params(C=c)
            clf.fit(Ytr, lab_tr)
            coefs_.append(clf.coef_.ravel().copy())
        coefs_ = SP.array(coefs_)
        # ordering by importance (how many C values keep a coefficient
        # non-zero), descending
        # NOTE(review): featrank_lasso is not used after this point.
        featrank_lasso = (coefs_ != 0).sum(axis=0).argsort()[::-1]

        # Logistic regression on all features with a very weak L2 penalty.
        clfAll = linear_model.LogisticRegression(C=1e5, penalty='l2', tol=1e-6)
        clfAll.fit(Ytr, lab_tr)
        predicted = clfAll.predict_proba(Ytst)
        if not is_final:
            predLRall[cv_tst, :] = predicted
        else:
            predLRall_ts = predicted

        forest = ExtraTreesClassifier(n_estimators=500, random_state=0,
                                      criterion="entropy", bootstrap=False)
        forest.fit(Ytr, lab_tr)
        pred = forest.predict_proba(Ytst)
        if not is_final:
            predRF[cv_tst, :] = pred
        else:
            predRF_ts = pred
        importances = forest.feature_importances_
        topfeat = min(Ytr.shape[1], rftop)
        # full feature ranking, most important first
        # NOTE(review): featrank_rf itself is not used after this point.
        featrank_rf = SP.argsort(importances)[::-1]
        indices = featrank_rf[0:topfeat]

        # Plot the feature importances of the forest (final model only)
        if is_final:
            PL.figure()
            PL.title("Feature importances")
            PL.bar(range(topfeat), importances[indices], color="r", align="center")
            PL.xticks(range(topfeat), indices, rotation=70)
            PL.gca().set_xticklabels(var_names[indices])
            PL.setp(PL.gca().get_xticklabels(), fontsize=8)
            PL.xlim([-1, topfeat])
            PL.savefig(out_dir + '/RF_featureimportance_' + foldlabel + '.pdf')

    # The context manager closes the report file even if a report fails.
    with open(os.path.join(out_dir, 'classification_reportCV.txt'), 'w') as f2:
        # Predicted class = most probable column, 1-based.
        predRF_trv = SP.argmax(predRF, axis=1) + 1
        self.scores = predRF
        self.scores_tst = predRF_ts
        self.ranking = var_names[indices]

        predLR_trv = SP.argmax(predLR, axis=1) + 1
        self.scoresLR = predLR
        self.scoresLR_tst = predLR_ts

        predLRall_trv = SP.argmax(predLRall, axis=1) + 1
        self.scoresLRall = predLRall
        self.scoresLRall_tst = predLRall_ts

        # Probability-based SVM scores exist only if the univariate SVM
        # branch ran, i.e. RFE did not take precedence.
        if CFG['is_SVM'] == 1 and CFG['is_RFE'] != 1:
            predSVM_trv = SP.argmax(predSVM, axis=1) + 1
            self.scoresSVM = predSVM
            self.scoresSVM_tst = predSVM_ts

            predSVMrbf_trv = SP.argmax(predSVMrbf, axis=1) + 1
            self.scoresSVMrbf = predSVMrbf
            self.scoresSVMrbf_tst = predSVMrbf_ts

        # Without PCA there is no naive Bayes model to report; referencing
        # its test scores used to raise NameError under the defaults.
        if has_gnb:
            predGNB_trv = SP.argmax(predGNB, axis=1) + 1
            self.scoresGNB = predGNB
            self.scoresGNB_tst = predGNB_ts
            print("Classification report for classifier %s:\n%s\n"
                  % ('Gaussian Naive Bayes', metrics.classification_report(self.labels, predGNB_trv)))

        print("Classification report for classifier %s:\n%s\n"
              % ('Random Forest', metrics.classification_report(self.labels, predRF_trv)))
        print("Classification report for classifier %s:\n%s\n"
              % ('LR', metrics.classification_report(self.labels, predLR_trv)))
        print("Classification report for classifier %s:\n%s\n"
              % ('LRall', metrics.classification_report(self.labels, predLRall_trv)))

        if CFG['is_RFE'] == 1:
            print("Classification report for classifier %s:\n%s\n"
                  % ('SVM ', metrics.classification_report(labels, predSVM > 0.5)), file=f2)
        elif CFG['is_SVM'] == 1:
            print("Classification report for classifier %s:\n%s\n"
                  % ('SVM', metrics.classification_report(self.labels, predSVM_trv)))
            print("Classification report for classifier %s:\n%s\n"
                  % ('SVMrbf', metrics.classification_report(self.labels, predSVMrbf_trv)))
def fit(self, xw, xwl2, y, gs=4, retrain_l1=False):
    """Fit the two-stage model.

    Stage 1 grid-searches the classifier selected by
    ``self.stage1_model_type`` ('logit', 'svm' or 'rf') on ``xw`` / ``y``
    and stores the winner in ``self.clf1``.  ``self.suffle_hm`` and
    ``self.auto_gamma`` then produce the stage-2 targets ``hm_y``
    (presumably hit/miss labels for the stage-1 predictions - confirm
    against those helpers).  Stage 2 grid-searches an L1-penalised logistic
    regression that predicts ``hm_y`` from ``xwl2`` and stores it in
    ``self.clf2``.

    Parameters
    ----------
    xw : array-like
        Stage-1 feature matrix.
    xwl2 : array-like
        Stage-2 feature matrix.
    y : array-like
        Stage-1 class labels.
    gs : int
        Number of stratified folds of the stage-1 grid search.
    retrain_l1 : bool
        Not used by this method; kept for interface compatibility.

    Raises
    ------
    ValueError
        If ``self.stage1_model_type`` is not 'logit', 'svm' or 'rf'.
    """
    print('Stage 1')
    # Estimator and grid are chosen together, so an unknown model type
    # fails here instead of later with an unbound-variable error.
    if self.stage1_model_type == 'logit':
        clf = LogisticRegression(C=1, class_weight='balanced', penalty='l2', max_iter=300)
        param_grid = dict(C=(5, 5.0001))
    elif self.stage1_model_type == 'svm':
        clf = SVC(C=1., cache_size=500, kernel='linear', class_weight='balanced', probability=False)
        param_grid = dict(C=(1., 1.00001))
    elif self.stage1_model_type == 'rf':
        clf = RandomForestClassifier(n_estimators=20, class_weight='balanced')
        param_grid = dict(n_estimators=(20, 10))
    else:
        raise ValueError("unknown stage1_model_type: %r" % (self.stage1_model_type,))

    # Stage 1
    gridclf = GridSearchCV(clf, param_grid=param_grid, cv=StratifiedKFold(n_splits=gs),
                           n_jobs=-1, scoring='accuracy')
    gridclf.fit(xw, y)
    self.clf1 = gridclf.best_estimator_
    if self.verbose:
        print(self.clf1)

    hm_y, proba = self.suffle_hm(xw, y, gamma=self.gamma, n_iter=100)

    # The result is stored under the same attribute name as the callable
    # (callers may read self.auto_gamma).  After a first fit() the instance
    # attribute hides the method, so fall back to the class-level method to
    # keep fit() callable more than once.
    estimate_gamma = self.auto_gamma
    if not callable(estimate_gamma):
        estimate_gamma = type(self).auto_gamma.__get__(self, type(self))
    hm_y, auto_gamma = estimate_gamma(proba, self.gamma)
    self.auto_gamma = auto_gamma

    if self.verbose:
        # Single-argument form prints the same text on Python 2 and 3.
        print('Average hm score %s' % np.mean(hm_y))

    print('Stage 2')
    # Stage 2
    clf2 = LogisticRegression(C=1., class_weight='balanced', penalty='l1',
                              solver='liblinear', max_iter=300)
    param_grid = dict(C=(np.logspace(-.2, 1, 15)))
    gridclf = GridSearchCV(clf2, param_grid=param_grid,
                           cv=StratifiedShuffleSplit(n_splits=50, test_size=.2, random_state=1),
                           n_jobs=-1, scoring='precision_weighted')
    gridclf.fit(xwl2, hm_y)
    clf2 = gridclf.best_estimator_
    if self.verbose:
        print(clf2)
        print(clf2.coef_)
    self.clf2 = clf2
def trainModel(self, do_pca=False, out_dir='./cache', rftop=40,
               class_labels=SP.array(['G1', 'S', 'G2M']), cv=10, npc=3,
               is_SVM=1, is_RFE=0, scale=False):
    """Cross-validate several classifiers on ``self.Y`` / ``self.labels``.

    For every outer cross-validation fold, plus one final pass trained on
    all samples, this fits:

    * a Gaussian naive Bayes model on PCA scores (only if ``do_pca >= 1``),
    * an SVM, RFE-based if ``is_RFE == 1``, otherwise with univariate
      feature selection if ``is_SVM == 1``,
    * an L1 logistic regression whose ``C`` is grid-searched on a path
      starting at ``l1_min_c``,
    * an L2 logistic regression with ``C=1e5``,
    * an extra-trees forest.

    Out-of-fold class probabilities are collected per model.  The final
    pass is evaluated on ``self.Y_tst`` / ``self.labels_tst`` when
    ``self.Y_tst`` is not ``None`` and on the training samples otherwise.

    Parameters
    ----------
    do_pca : bool or int
        Falsy: no PCA.  ``>= 1``: fit a PCA with ``npc`` components and
        train the naive Bayes model on it.  ``2``: also append the PCA
        scores to the feature matrix.  ``3``: selects the PCA-aware SVM
        grid (see the review note in the SVM branch).
    out_dir : str
        Output directory (created if missing) for plots and the report file.
    rftop : int
        Maximum number of top forest features plotted and stored in
        ``self.ranking``.
    class_labels : array
        Not used by this method.
    cv : int or 'LOOCV'
        Outer cross-validation scheme; also the inner scheme of the RFE
        branch.
    npc : int
        Number of PCA components.
    is_SVM, is_RFE : int
        Select the SVM variant (RFE takes precedence).
    scale : bool
        Not used by this method.

    Side effects
    ------------
    Sets ``self.cv``, ``self.scores``, ``self.scores_tst``,
    ``self.ranking``, ``self.scoresLR*`` and ``self.scoresLRall*``; sets
    ``self.scoresSVM*`` / ``self.scoresSVMrbf*`` when the univariate SVM
    branch ran and ``self.scoresGNB*`` when ``do_pca >= 1``.  Saves the
    feature-importance plot of the final pass, creates
    ``classification_reportCV.txt`` in ``out_dir`` and prints the
    classification reports.
    """
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    CFG = {
        'is_RFE': is_RFE,  # use recursive feature selection (can be slow for large datasets)
        'is_SVM': is_SVM,  # use SVM with univariate feature selection (faster than RFE)
        'CV_inner': cv,  # inner CV for RFE_CV: either an int or 'LOOCV'
        'out_dir': out_dir,
        'do_pca': do_pca,
        'lassotop': 20,
        # Per-fold ROC plots in the RFE branch.  The key used to be read
        # without ever being set (KeyError); 0 keeps per-fold plots off.
        'CV_plots': 0,
    }

    self.cv = cv
    Y = self.Y
    labels = self.labels
    var_names = self.geneNames
    numClasses = self.numClasses

    # Out-of-fold class-probability matrices, one row per training sample.
    pred_shape = (len(labels), numClasses)
    predRF = SP.zeros(pred_shape)
    predSVM = SP.zeros(pred_shape)
    predSVMrbf = SP.zeros(pred_shape)
    predGNB = SP.zeros(pred_shape)
    predLR = SP.zeros(pred_shape)
    predLRall = SP.zeros(pred_shape)
    names_dict = {}

    if self.cv == 'LOOCV':
        CV_list = list(iter(LeaveOneOut(len(labels))))
    else:
        CV_list = list(iter(StratifiedKFold(labels, n_folds=self.cv)))
    # The last entry trains and tests on all data: the "final model" pass.
    CV_list.append((SP.array(range(Y.shape[0])), SP.array(range(Y.shape[0]))))

    lambda_best = SP.zeros((1, len(CV_list))).ravel()
    last = len(CV_list) - 1
    # The naive Bayes model is only trained when PCA features exist.
    has_gnb = do_pca >= 1

    print("Performing cross validation ...")
    for i, (cv_tr, cv_tst) in enumerate(CV_list):
        is_final = (i == last)
        if not is_final:
            print("Fold " + str(i + 1) + " of " + str(last))
        else:
            print("Final model")

        # data of this CV run
        lab_tr = labels[cv_tr]
        Ytr = Y[cv_tr, :]
        Ytst = Y[cv_tst, :]
        lab_tst = labels[cv_tst]

        # string label for this fold
        if is_final:
            foldlabel = 'full'
            # `is None`, not `== None`: the latter compares an array
            # elementwise and cannot be used as a condition.
            if self.Y_tst is not None:
                foldlabel = 'Test'
                Ytst = self.Y_tst
                lab_tst = self.labels_tst
        else:
            foldlabel = str(i)

        if has_gnb:
            # PCA scores are the features of the naive Bayes model.
            pcaCC = PCA(n_components=npc, whiten=False)
            pcaCC.fit(Ytr)
            pcaTst = pcaCC.transform(Ytst)
            pcaTr = pcaCC.transform(Ytr)
            gnb = GaussianNB()
            y_pred = gnb.fit(pcaTr, lab_tr).predict_proba(pcaTst)
            if not is_final:
                predGNB[cv_tst, :] = y_pred
            else:
                predGNB_ts = y_pred
            if do_pca == 2:
                # Append the PCA scores as extra feature columns.
                Ytr = SP.concatenate((Ytr, pcaTr), 1)
                Ytst = SP.concatenate((Ytst, pcaTst), 1)
                pcnames = ['PC' + str(pci + 1) for pci in range(npc)]
                # Rebuild from the original names on every fold so the PC
                # names are not appended repeatedly; both arrays are 1-D,
                # hence the default axis.
                var_names = SP.concatenate((self.geneNames, SP.array(pcnames)))

        print(" Computing random forest ...")
        if CFG['is_RFE'] == 1:  # Recursive feature selection with SVM
            print(" Computing RFE with SVM ...")
            svc = SVC(kernel="linear", probability=False, class_weight='auto')  # linear SVM for selection
            rfecv = RFECV(estimator=svc, step=1, scoring='f1')
            param_grid = dict(estimator__C=[0.1, 1, 10, 100, 1000])
            clf_rfe = GridSearchCV(rfecv, param_grid=param_grid, cv=3, scoring='f1')  # find optimal parameters
            clf_rfe.fit(Ytr, lab_tr)
            # NOTE(review): the selector rebuilt below is never fitted or
            # used; the grid-search winner is used instead.  Confirm intent.
            svc = SVC(kernel="linear", probability=False,
                      C=clf_rfe.best_estimator_.estimator.C, class_weight='auto')
            if CFG['CV_inner'] == '':
                rfecv = RFECV(estimator=svc, step=1, scoring='f1')
            elif CFG['CV_inner'] == 'LOOCV':
                rfecv = RFECV(estimator=svc, step=1, scoring='f1',
                              cv=LeaveOneOut(len(lab_tr)))
            else:
                rfecv = RFECV(estimator=svc, step=1, scoring='f1',
                              cv=StratifiedKFold(lab_tr, n_folds=CFG['CV_inner']))
            clf_rfe.best_estimator_.fit(Ytr, lab_tr)
            # NOTE(review): predict() yields hard labels, while predSVM
            # holds one column per class - verify the shapes are compatible.
            predicted = clf_rfe.best_estimator_.predict(Ytst)
            if not is_final:
                predSVM[cv_tst, :] = predicted
            else:
                # Was an indexed write into an undefined name (NameError).
                predSVM_ts = predicted

            selected = clf_rfe.best_estimator_.ranking_ == 1
            classifier = svm.SVC(kernel='rbf', gamma=0.05, class_weight='auto',
                                 probability=True)  # rbf kernel for prediction
            param_grid = dict(C=[0.1, 1], gamma=[1e-1, 1e-2, 1e-3])
            clf_rbf = GridSearchCV(classifier, param_grid=param_grid, cv=3, scoring='f1')
            clf_rbf.fit(Ytr[:, selected], lab_tr)
            clf_rbf.best_estimator_.fit(Ytr[:, selected], lab_tr)
            predicted = clf_rbf.best_estimator_.predict_proba(Ytst[:, selected])
            if not is_final:
                predSVMrbf[cv_tst, :] = predicted
            else:
                predSVMrbf_ts = predicted
            fpr, tpr, _ = metrics.roc_curve(lab_tst, predicted[:, 1])
            if is_final or CFG['CV_plots'] > 0:
                PL.figure()
                PL.plot(fpr, tpr)
                PL.savefig(CFG['out_dir'] + '/RF_SVM_' + foldlabel + '.pdf')
            names_dict[foldlabel + '_SVM'] = self.geneNames[selected]
        elif CFG['is_SVM'] == 1:
            # univariate FS with rbf SVM; choose this if you have a large
            # data set (many features, eg RNAseq)
            print(" SVM feature selection ...")
            classifier = svm.SVC(kernel='rbf', gamma=0.05, class_weight='auto',
                                 probability=True)
            selection = SelectKBest(k=1)
            combined_features = FeatureUnion([("univ_select", selection)])
            X_features = combined_features.fit(Ytr, lab_tr).transform(Ytr)
            scaler = preprocessing.StandardScaler().fit(Ytr)
            YtrS = scaler.transform(Ytr)
            YtstS = scaler.transform(Ytst)
            classifier.fit(X_features, lab_tr)
            pipeline = Pipeline([("features", combined_features), ("svm", classifier)])

            # Candidate numbers of selected features: log2-spaced from 8 up
            # to the number of features.
            k_grid = SP.unique(SP.round_(SP.logspace(
                3.0, SP.log2(Ytr.shape[1]), num=min(10, Ytr.shape[1]), base=2.0)))
            if CFG['do_pca'] == 3:
                # NOTE(review): the feature union above has no "pca" step,
                # so this grid names a parameter the pipeline lacks - verify.
                param_grid = dict(
                    features__pca__n_components=SP.unique(SP.round_(SP.logspace(
                        1.0, max(SP.log2(Ytr.shape[1]), SP.log2(10)),
                        num=min(5, Ytr.shape[1]), base=2.0))),
                    features__univ_select__k=k_grid,
                    svm__C=[0.1, 1, 10],
                    svm__gamma=[1e-1, 1e-2, 1e-3])
            else:
                C_range = 10. ** SP.arange(0, 2)
                gamma_range = 10. ** SP.arange(-5, 1)
                param_grid = dict(features__univ_select__k=k_grid,
                                  svm__C=C_range, svm__gamma=gamma_range)
            clf = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='f1')
            clf.fit(YtrS, lab_tr)
            print("The best classifier is: ", clf.best_estimator_)
            predicted = clf.best_estimator_.predict_proba(YtstS)
            if not is_final:
                predSVM[cv_tst, :] = predicted
            else:
                predSVM_ts = predicted

            classifier = svm.SVC(kernel='rbf', gamma=0.05, class_weight='auto',
                                 probability=True)  # rbf kernel for prediction
            param_grid = dict(C=[1, 10], gamma=[1e-1, 1e-2, 1e-3])
            clf_rbf = GridSearchCV(classifier, param_grid=param_grid, cv=5, scoring='f1')
            clf_rbf.fit(Ytr, lab_tr)
            clf_rbf.best_estimator_.fit(Ytr, lab_tr)
            predicted = clf_rbf.best_estimator_.predict_proba(Ytst)
            if not is_final:
                predSVMrbf[cv_tst, :] = predicted
            else:
                predSVMrbf_ts = predicted

        # do lasso with regularisation path
        cs = l1_min_c(Ytr, lab_tr, loss='log') * SP.logspace(0, 3)
        print(" Computing regularization path ...")
        lasso = linear_model.LogisticRegression(C=cs[0] * 10.0, penalty='l1', tol=1e-6)
        param_grid = dict(C=cs)
        clf_lr = GridSearchCV(lasso, param_grid=param_grid, cv=5, scoring='f1')
        clf_lr.fit(Ytr, lab_tr)
        clf_lr.best_estimator_.fit(Ytr, lab_tr)
        lambda_best[i] = clf_lr.best_params_.get('C')
        predicted = clf_lr.best_estimator_.predict_proba(Ytst)
        if not is_final:
            predLR[cv_tst, :] = predicted
        else:
            predLR_ts = predicted

        # Coefficients along the whole path, one row per C value.
        clf = linear_model.LogisticRegression(C=cs[0] * 10.0, penalty='l1', tol=1e-6)
        coefs_ = []
        for c in cs:
            clf.set_params(C=c)
            clf.fit(Ytr, lab_tr)
            coefs_.append(clf.coef_.ravel().copy())
        coefs_ = SP.array(coefs_)
        # ordering by importance (how many C values keep a coefficient
        # non-zero), descending
        # NOTE(review): featrank_lasso is not used after this point.
        featrank_lasso = (coefs_ != 0).sum(axis=0).argsort()[::-1]

        # Logistic regression on all features with a very weak L2 penalty.
        clfAll = linear_model.LogisticRegression(C=1e5, penalty='l2', tol=1e-6)
        clfAll.fit(Ytr, lab_tr)
        predicted = clfAll.predict_proba(Ytst)
        if not is_final:
            predLRall[cv_tst, :] = predicted
        else:
            predLRall_ts = predicted

        forest = ExtraTreesClassifier(n_estimators=500, random_state=0,
                                      criterion="entropy", bootstrap=False)
        forest.fit(Ytr, lab_tr)
        pred = forest.predict_proba(Ytst)
        if not is_final:
            predRF[cv_tst, :] = pred
        else:
            predRF_ts = pred
        importances = forest.feature_importances_
        topfeat = min(Ytr.shape[1], rftop)
        # full feature ranking, most important first
        # NOTE(review): featrank_rf itself is not used after this point.
        featrank_rf = SP.argsort(importances)[::-1]
        indices = featrank_rf[0:topfeat]

        # Plot the feature importances of the forest (final model only)
        if is_final:
            PL.figure()
            PL.title("Feature importances")
            PL.bar(range(topfeat), importances[indices], color="r", align="center")
            PL.xticks(range(topfeat), indices, rotation=70)
            PL.gca().set_xticklabels(var_names[indices])
            PL.setp(PL.gca().get_xticklabels(), fontsize=8)
            PL.xlim([-1, topfeat])
            PL.savefig(out_dir + '/RF_featureimportance_' + foldlabel + '.pdf')

    # The context manager closes the report file even if a report fails.
    with open(os.path.join(out_dir, 'classification_reportCV.txt'), 'w') as f2:
        # Predicted class = most probable column, 1-based.
        predRF_trv = SP.argmax(predRF, axis=1) + 1
        self.scores = predRF
        self.scores_tst = predRF_ts
        self.ranking = var_names[indices]

        predLR_trv = SP.argmax(predLR, axis=1) + 1
        self.scoresLR = predLR
        self.scoresLR_tst = predLR_ts

        predLRall_trv = SP.argmax(predLRall, axis=1) + 1
        self.scoresLRall = predLRall
        self.scoresLRall_tst = predLRall_ts

        # Probability-based SVM scores exist only if the univariate SVM
        # branch ran, i.e. RFE did not take precedence.
        if CFG['is_SVM'] == 1 and CFG['is_RFE'] != 1:
            predSVM_trv = SP.argmax(predSVM, axis=1) + 1
            self.scoresSVM = predSVM
            self.scoresSVM_tst = predSVM_ts

            predSVMrbf_trv = SP.argmax(predSVMrbf, axis=1) + 1
            self.scoresSVMrbf = predSVMrbf
            self.scoresSVMrbf_tst = predSVMrbf_ts

        # Without PCA there is no naive Bayes model to report; referencing
        # its test scores used to raise NameError under the defaults.
        if has_gnb:
            predGNB_trv = SP.argmax(predGNB, axis=1) + 1
            self.scoresGNB = predGNB
            self.scoresGNB_tst = predGNB_ts
            print("Classification report for classifier %s:\n%s\n"
                  % ('Gaussian Naive Bayes', metrics.classification_report(self.labels, predGNB_trv)))

        print("Classification report for classifier %s:\n%s\n"
              % ('Random Forest', metrics.classification_report(self.labels, predRF_trv)))
        print("Classification report for classifier %s:\n%s\n"
              % ('LR', metrics.classification_report(self.labels, predLR_trv)))
        print("Classification report for classifier %s:\n%s\n"
              % ('LRall', metrics.classification_report(self.labels, predLRall_trv)))

        if CFG['is_RFE'] == 1:
            print("Classification report for classifier %s:\n%s\n"
                  % ('SVM ', metrics.classification_report(labels, predSVM > 0.5)), file=f2)
        elif CFG['is_SVM'] == 1:
            print("Classification report for classifier %s:\n%s\n"
                  % ('SVM', metrics.classification_report(self.labels, predSVM_trv)))
            print("Classification report for classifier %s:\n%s\n"
                  % ('SVMrbf', metrics.classification_report(self.labels, predSVMrbf_trv)))
def fit(self, xw, xwl2, y, gs=4, model_type='logit', verbose=True): self.verbose = verbose if model_type == 'logit': clf = LogisticRegression(C=1, class_weight='balanced', penalty='l2', max_iter=300) else: #clf = SVC(kernel='linear', class_weight='balanced', C=.1,probability=False) clf = SVC(C=1., cache_size=500, kernel='linear', class_weight='balanced', probability=False) #clf = RandomForestClassifier(n_estimators=500,class_weight='balanced') ''' # wrapper feature selection rfecv = RFECV(estimator=clf, step=1, cv=StratifiedKFold(y, 3), scoring='f1')#accuracy rfecv.fit(xw, y) print("Optimal number of features : %d" % rfecv.n_features_) print("ids: {}".format((rfecv.ranking_<=5).sum())) print rfecv.grid_scores_ self.rfecv = rfecv if rfecv.support_.sum()>10: self.w_select = rfecv.support_ else: self.w_select = rfecv.ranking_<=10 ''' #xw = [:,self.w_select] #self.mask_selection = (np.ones((1,xw.shape[1]))==1)[0,:] ## Optimize the hyper parameters # Stage 1 #param_grid = dict(C=(np.array([5,3,1]))) ''' if model_type=='logit': param_grid = dict(C=(10**np.arange(1.,-2.,-0.5))) #param_grid = dict(C=(np.logspace(-.2, 1., 15))) #param_grid = dict(C=(np.arange(3,1,-0.5))) else: param_grid = dict(C=(np.arange(3.5,0.,-0.5))) param_grid = dict(C=(1.,1.00001)) #param_grid = dict(C=(np.logspace(-1.5, 0, 10))) #param_grid = dict(C=(np.arange(2.,0.5,-0.05))) #param_grid = dict(C=(np.array([0.01, 0.1, 1, 10, 100, 1000]))) gridclf = GridSearchCV(clf, param_grid=param_grid, cv=StratifiedKFold(y,n_folds=gs), n_jobs=-1,scoring='accuracy') gridclf.fit(xw,y) self.clf1 = gridclf.best_estimator_ ''' self.clf1 = clf self.clf1.fit(xw, y) if self.verbose: print self.clf1 print self.clf1.coef_ #print self.clf1.feature_importances_ #hm_y,y_pred_train = self.estimate_hitmiss(xw,y) hm_y, proba = self.suffle_hm(xw, y, gamma=.8, n_iter=100) #hm_y = self.clf1.predict(xw) print 'Stage 2' #Stage 2 min_c = l1_min_c(xwl2, hm_y, loss='log') print 'minimum c: ', min_c #clf2 = 
LogisticRegression(C=10**0.1,class_weight=None,penalty='l2',solver='sag') #clf2 = LogisticRegression(C=1,class_weight=None,penalty='l2',solver='sag',max_iter=300) #clf2 = LinearSVC(class_weight='balanced',penalty='l1',dual=False) clf2 = LogisticRegression(C=1., class_weight='balanced', penalty='l2', solver='liblinear', max_iter=300) #clf2 = LogisticRegression(C=1.,class_weight='balanced',penalty='l2',solver='sag',max_iter=300) #clf2 = SVC(C=1.,cache_size=500,kernel='linear',class_weight='balanced') #clf2 = RandomForestClassifier(n_estimators=500,class_weight='balanced',oob_score=True) #param_grid = dict(C=(10**np.arange(1.,-2.,-0.5))) #param_grid = dict(C=(np.arange(3,1,-0.5))) #param_grid = dict(C=(np.logspace(-0.5, 2., 30))) #param_grid = dict(C=(np.logspace(1., 1.6, 30))) param_grid = dict(C=(np.logspace(-.2, 1., 15))) #param_grid = dict(C=(np.logspace(-.15, 1., 15))) #param_grid = dict(C=(np.logspace(np.log10(min_c), 0., 15))) #param_grid = dict(C=(1,1.0001)) # 2 levels balancing ''' new_classes = np.zeros_like(y) new_classes[(y==0) & (hm_y==0)]=0 new_classes[(y==1) & (hm_y==0)]=1 new_classes[(y==0) & (hm_y==1)]=2 new_classes[(y==1) & (hm_y==1)]=3 tmp_samp_w = len(new_classes) / (len(np.unique(new_classes))*1. 
* np.bincount(new_classes)) tmp_samp_w = (1.*(tmp_samp_w/tmp_samp_w.sum())) sample_w = new_classes.copy().astype(float) sample_w[new_classes==0] = tmp_samp_w[0] sample_w[new_classes==1] = tmp_samp_w[1] sample_w[new_classes==2] = tmp_samp_w[2] sample_w[new_classes==3] = tmp_samp_w[3] ''' #gridclf = GridSearchCV(clf2, param_grid=param_grid, cv=StratifiedKFold(hm_y,n_folds=gs),fit_params=dict(sample_weight=sample_w), n_jobs=-1,scoring='accuracy') #gridclf = GridSearchCV(clf2, param_grid=param_grid, cv=StratifiedKFold(hm_y,n_folds=gs),fit_params=dict(sample_weight=proba), n_jobs=-1,scoring='accuracy') gridclf = GridSearchCV(clf2, param_grid=param_grid, cv=StratifiedKFold(hm_y, n_folds=gs), n_jobs=-1, scoring='accuracy') gridclf.fit(xwl2, hm_y) clf2 = gridclf.best_estimator_ #clf2.fit(xwl2,hm_y) print 'class order : ', clf2.classes_ # train right classifier on easy cases #self.clf_easy = SVC(C=1.,cache_size=500,kernel='linear',class_weight='balanced',probability=False) #l2_pred = clf2.predict(xwl2) #self.clf_easy.fit(xw[l2_pred>0,:],y[l2_pred>0]) if self.verbose: print clf2 print clf2.coef_ self.clf2 = clf2
def grid_search_lr_c(X_train, y_train, df_coef_path=False,
                     pic_coefpath_title='Logistic Regression Path',
                     pic_coefpath=False,
                     pic_performance_title='Logistic Regression Performance',
                     pic_performance=False):
    """
    grid search optimal hyper parameters c with the best ks performance

    An L1-penalised logistic regression is refit for every C on the path
    ``l1_min_c(X_train, y_train, loss='log') * np.logspace(0, 3)``.  For each
    C the coefficients and the KS score (``compute_ks`` on the training
    probabilities of class column 1) are recorded, then both are plotted
    against log10(C).  Only path points whose coefficients are all
    non-negative are eligible for the returned optimum.

    :param X_train: features dataframe
    :param y_train: target
    :param df_coef_path: the file path for logistic regression coefficient dataframe
    :param pic_coefpath_title: the pic title for coefficient path picture
    :param pic_coefpath: the file path for coefficient path picture
    :param pic_performance_title: the pic title for ks performance picture
    :param pic_performance: the file path for ks performance picture
    :return: a tuple of c and ks value with the best ks performance
    :raises ValueError: if every point on the path has a negative coefficient
    """
    # init a LogisticRegression model
    # liblinear is pinned explicitly: newer scikit-learn releases default to
    # lbfgs, which rejects penalty='l1'.
    clf_l1_LR = LogisticRegression(C=0.1, penalty='l1', tol=0.01,
                                   class_weight='balanced',
                                   solver='liblinear')
    cs = l1_min_c(X_train, y_train, loss='log') * np.logspace(0, 3)

    print("Computing regularization path ...")
    start = datetime.now()
    print(start)
    coefs_ = []
    ks = []
    for c in cs:
        clf_l1_LR.set_params(C=c)
        clf_l1_LR.fit(X_train, y_train)
        # copy(): keep a snapshot of this fit's coefficients before the next
        # fit on the same estimator replaces them.
        coefs_.append(clf_l1_LR.coef_.ravel().copy())
        proba = clf_l1_LR.predict_proba(X_train)[:, 1]
        ks.append(compute_ks(proba, y_train))
    end = datetime.now()
    print(end)
    print("This took ", end - start)

    # One row per C: every coefficient plus the KS score and the C itself.
    coef_cv_df = pd.DataFrame(coefs_, columns=X_train.columns)
    coef_cv_df['ks'] = ks
    coef_cv_df['c'] = cs

    if df_coef_path:
        file_name = df_coef_path if isinstance(df_coef_path, str) else None
        coef_cv_df.to_csv(file_name)

    coefs_ = np.array(coefs_)
    log_cs = np.log10(cs)

    # Figure 1: coefficient path.
    plt.figure('fig1')
    plt.plot(log_cs, coefs_)
    plt.xlabel('log(C)')
    plt.ylabel('Coefficients')
    plt.title(pic_coefpath_title)
    plt.axis('tight')
    if pic_coefpath:
        file_name = pic_coefpath if isinstance(pic_coefpath, str) else None
        plt.savefig(file_name)
    else:
        plt.show()

    # Figure 2: KS score along the path.
    plt.figure('fig2')
    plt.plot(log_cs, ks)
    plt.xlabel('log(C)')
    plt.ylabel('ks score')
    plt.title(pic_performance_title)
    plt.axis('tight')
    if pic_performance:
        file_name = pic_performance if isinstance(pic_performance, str) else None
        plt.savefig(file_name)
    else:
        plt.show()

    # Positions (in the full path) of the fits without any negative
    # coefficient.  The argmax over the filtered KS values is a position
    # inside this subset, so it has to be mapped back through `eligible`
    # before it can index `cs` / `ks`.
    eligible = np.flatnonzero((coefs_ < 0).sum(axis=1) == 0)
    if eligible.size == 0:
        raise ValueError(
            "no point on the regularization path has all non-negative "
            "coefficients")
    idx = eligible[np.array(ks)[eligible].argmax()]
    return (cs[idx], ks[idx])
from sklearn import datasets
from sklearn.svm import l1_min_c

# Regularization path of an L1-penalised logistic regression on iris.
# NOTE(review): np, pl, datetime and linear_model are used below but are not
# imported in the visible part of this script -- confirm they are imported
# earlier in the file.

iris = datasets.load_iris()
X = iris.data
y = iris.target

# Drop class 2 so only two classes remain.
X = X[y != 2]
y = y[y != 2]

# Centre every feature column (in place).
X -= np.mean(X, 0)

################################################################################
# Demo path functions

cs = l1_min_c(X, y, loss='log') * np.logspace(0, 3)

print("Computing regularization path ...")
start = datetime.now()
# liblinear is pinned explicitly: newer scikit-learn releases default to
# lbfgs, which rejects penalty='l1'.
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6,
                                      solver='liblinear')
coefs_ = []
for c in cs:
    # C is a constructor hyper-parameter; set it via set_params rather than
    # passing it to fit().
    clf.set_params(C=c)
    clf.fit(X, y)
    coefs_.append(clf.coef_.ravel().copy())
print("This took ", datetime.now() - start)

coefs_ = np.array(coefs_)
pl.plot(np.log10(cs), coefs_)
ymin, ymax = pl.ylim()
pl.xlabel('log(C)')
pl.ylabel('Coefficients')
pl.title('Logistic Regression Path')
pl.axis('tight')
def logisticL1NestedCV(df, outcomeVar, predVars, nFolds=10, LPO=None, Cs=10, n_jobs=1, scorer='log_loss'):
    """Apply logistic regression with L1-regularization (LASSO) to df.
    Uses nested cross-validation framework with inner folds to optimize C
    and outer test folds to evaluate performance.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain outcome and predictor variables. Rows with a missing
        value in any of these columns are dropped.
    outcomeVar : str
    predVars : ndarray or list
        Predictor variables in the model.
    nFolds : int
        N-fold stratified cross-validation
    LPO : int or None
        Use Leave-P-Out cross-validation instead of StratifiedNFoldCV
    Cs : int, list or None
        Each of the values in Cs describes the inverse of regularization strength.
        If Cs is an int, a grid of that many values is built on a logarithmic
        scale from l1_min_c(X, y) up to 1e7 times that value. None is treated
        like 10. Smaller values specify stronger regularization.
    n_jobs : int
        Passed to LogisticRegressionCV.
    scorer : str
        Only recorded in the output under 'scorer'; the inner CV always
        scores with negated log-loss.

    Returns
    -------
    results : dict
        Contains results as keys below:
        fpr:            (100, ) average FPR for ROC
        tpr:            (100, ) average TPR for ROC
        AUC:            (outerFolds, ) AUC of ROC for each outer test fold
        mAUC:           (1, ) AUC of the average ROC
        ACC:            (outerFolds, ) accuracy across outer test folds
        mACC:           mean of ACC
        scores:         (outerFolds, innerFolds, Cs) score for each C across inner and outer CV folds
        scorer:         the scorer argument, unchanged
        optimalCs:      (outerFolds, ) optimal C from each set of inner CV
        C:              geometric mean of optimalCs, used for the final refit
        finalResult:    final fitted model with predict() exposed
        prob:           (N,) pd.Series of predicted probabilities avg over outer folds
        varList:        (Nvars, ) list of vars with non-zero coef in final model
        Cs:             (Cs, ) pre-specified grid of Cs
        coefs:          (outerFolds, predVars) refit with optimalC in each fold
        paths:          (outerFolds, Cs, predVars + intercept) avg across inner folds
        Xvars:          list of all vars in X
        Yvar:           name of outcome variable
        N:              total number of rows/instances in the model
        Sensitivity, Specificity: taken from rocStats on the rounded
                        averaged probabilities"""

    if not isinstance(predVars, list):
        predVars = list(predVars)

    tmp = df[[outcomeVar] + predVars].dropna()
    X, y = tmp[predVars].astype(float), tmp[outcomeVar].astype(float)

    if np.isscalar(Cs):
        # From sklearn example:
        # https://scikit-learn.org/stable/auto_examples/linear_model/plot_logistic_path.html
        Cs = l1_min_c(X, y, loss='log') * np.logspace(0, 7, Cs)
    elif Cs is None:
        Cs = l1_min_c(X, y, loss='log') * np.logspace(0, 7, 10)

    if LPO is None:
        innerCV = StratifiedKFold(n_splits=nFolds, shuffle=True)
        outerCV = StratifiedKFold(n_splits=nFolds, shuffle=True)
    else:
        innerCV = LeavePOut(LPO)
        outerCV = LeavePOut(LPO)

    # Size the per-fold result arrays from the splitter itself: Leave-P-Out
    # produces a number of splits unrelated to nFolds, so sizing by nFolds
    # would overflow the arrays in the loop below.
    nOuter = outerCV.get_n_splits(X)

    scorerFunc = sklearn.metrics.make_scorer(sklearn.metrics.log_loss,
                                             greater_is_better=False,
                                             needs_proba=True,
                                             needs_threshold=False,
                                             labels=[0, 1])

    # Common FPR grid onto which each fold's ROC curve is interpolated.
    fpr = np.linspace(0, 1, 100)
    tpr = np.full((fpr.shape[0], nOuter), np.nan)
    acc = np.full(nOuter, np.nan)
    auc = np.full(nOuter, np.nan)
    paths = []
    coefs = []
    probs = []
    optimalCs = np.full(nOuter, np.nan)
    scores = []

    for outi, (trainInd, testInd) in enumerate(outerCV.split(X=X, y=y)):
        Xtrain, Xtest = X.iloc[trainInd], X.iloc[testInd]
        ytrain, ytest = y.iloc[trainInd], y.iloc[testInd]

        model = sklearn.linear_model.LogisticRegressionCV(Cs=Cs,
                                                          cv=innerCV,
                                                          penalty='l1',
                                                          solver='liblinear',
                                                          scoring=scorerFunc,
                                                          refit=True,
                                                          n_jobs=n_jobs)
        # With refit = True, the scores are averaged across all folds,
        # and the coefs and the C that corresponds to the best score is taken,
        # and a final refit is done using these parameters.

        results = model.fit(X=Xtrain, y=ytrain)
        prob = results.predict_proba(Xtest)

        # Column of predict_proba that corresponds to class label 1.
        class1Ind = np.nonzero(results.classes_ == 1)[0][0]
        fprTest, tprTest, _ = sklearn.metrics.roc_curve(ytest, prob[:, class1Ind])

        tpr[:, outi] = np.interp(fpr, fprTest, tprTest)
        auc[outi] = sklearn.metrics.auc(fprTest, tprTest)
        acc[outi] = sklearn.metrics.accuracy_score(ytest,
                                                   np.round(prob[:, class1Ind]),
                                                   normalize=True)
        optimalCs[outi] = results.C_[0]
        scores.append(results.scores_[1])
        paths.append(results.coefs_paths_[1])
        coefs.append(results.coef_)
        probs.append(pd.Series(prob[:, class1Ind], index=Xtest.index))

    meanTPR = np.mean(tpr, axis=1)
    # Pin the averaged ROC curve to its (0, 0) and (1, 1) end points.
    meanTPR[0], meanTPR[-1] = 0, 1
    meanACC = np.mean(acc)
    meanAUC = sklearn.metrics.auc(fpr, meanTPR)
    # Geometric mean of the per-fold optimal C values.
    meanC = 10**np.mean(np.log10(optimalCs))
    paths = np.concatenate([p.mean(axis=0, keepdims=True) for p in paths], axis=0)
    scores = np.concatenate([s[None, :, :] for s in scores], axis=0)

    # Compute mean probability over test predictions in CV
    probS = pd.concat(probs).groupby(level=0).agg(np.mean)
    probS.name = 'Prob'

    # Refit all the data with the optimal C for variable selection and
    # classification of holdout data
    model = sklearn.linear_model.LogisticRegression(C=meanC, penalty='l1', solver='liblinear')
    result = model.fit(X=X, y=y)
    varList = np.array(predVars)[result.coef_.ravel() != 0].tolist()

    rocRes = rocStats(y, np.round(probS))

    outD = {
        'fpr': fpr,                  # (100, ) average FPR for ROC
        'tpr': meanTPR,              # (100, ) average TPR for ROC
        'AUC': auc,                  # (outerFolds, ) AUC of ROC for each outer test fold
        'mAUC': meanAUC,             # (1, ) AUC of the average ROC
        'ACC': acc,                  # (outerFolds, ) accuracy across outer test folds
        'mACC': meanACC,
        'scores': scores,            # (outerFolds, innerFolds, Cs) score for each C across inner and outer CV folds
        'scorer': scorer,
        'optimalCs': optimalCs,      # (outerFolds, ) optimal C from each set of inner CV
        'C': meanC,
        'finalResult': result,       # final fitted model with predict() exposed
        'prob': probS,               # (N,) pd.Series of predicted probabilities avg over outer folds
        'varList': varList,          # list of vars with non-zero coef in final model
        'Cs': Cs,                    # pre-specified grid of Cs
        'coefs': np.concatenate(coefs),  # (outerFolds, predVars) refit with optimalC in each fold
        'paths': paths,              # (outerFolds, Cs, predVars + intercept) avg across inner folds
        'Xvars': predVars,
        'Yvar': outcomeVar,
        'N': tmp.shape[0],
    }
    outD.update(rocRes[['Sensitivity', 'Specificity']].to_dict())
    return outD
from sklearn import linear_model
from sklearn import datasets
from sklearn.svm import l1_min_c

# Regularization path of an L1-penalised logistic regression on iris.
# NOTE(review): np and datetime are used below but are not imported in the
# visible part of this script -- confirm they are imported earlier in the file.

iris = datasets.load_iris()
X = iris.data
y = iris.target

# Drop class 2 so only two classes remain.
X = X[y != 2]
y = y[y != 2]

# Centre every feature column (in place).
X -= np.mean(X, 0)

# #############################################################################
# Demo path functions

cs = l1_min_c(X, y, loss='log') * np.logspace(0, 3)

print("Computing regularization path ...")
start = datetime.now()
# liblinear is pinned explicitly: newer scikit-learn releases default to
# lbfgs, which rejects penalty='l1'.
clf = linear_model.LogisticRegression(penalty='l1', tol=1e-6,
                                      solver='liblinear')
coefs_ = []
for c in cs:
    clf.set_params(C=c)
    clf.fit(X, y)
    # copy(): keep a snapshot of this fit's coefficients before the next fit
    # on the same estimator replaces them.
    coefs_.append(clf.coef_.ravel().copy())
print("This took ", datetime.now() - start)

coefs_ = np.array(coefs_)
def reg(): i = datasets.make_classification(n_samples=100, n_features=2, n_informative=1, n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=1, weights=None, flip_y=0.01, class_sep=1.0, hypercube=True, shift=0.0, scale=1.0, shuffle=True, random_state=None) j = datasets.make_classification(n_samples=100, n_features=2, n_informative=2, n_redundant=0, n_repeated=0, n_classes=4, n_clusters_per_class=1, weights=None, flip_y=0.01, class_sep=1.0, hypercube=True, shift=0.0, scale=1.0, shuffle=True, random_state=None) k = datasets.make_classification(n_samples=100, n_features=200, n_informative=2, n_redundant=2, n_repeated=0, n_classes=2, n_clusters_per_class=2, weights=None, flip_y=0.01, class_sep=1.0, hypercube=True, shift=0.0, scale=1.0, shuffle=True, random_state=None) ########################################################################## ############## Dataset A ############################# ########################################################################## X = i[0] y = i[1] X = X[y != 2] y = y[y != 2] X -= np.mean(X, 0) cs = l1_min_c(X, y, loss='log') * np.logspace(0, 3) print "Computing regularization path 2D, 2 classes..." start = datetime.now() clf = LogisticRegression(C=1.0, penalty='l1', tol=1e-6) coefs_ = [] for c in cs: clf.set_params(C=c) clf.fit(X, y) coefs_.append(clf.coef_.ravel().copy()) print "This took ", datetime.now() - start pl.figure() coefs_ = np.array(coefs_) pl.plot(np.log10(cs), coefs_) ymin, ymax = pl.ylim() pl.xlabel('log(C)') pl.ylabel('Coefficients') pl.title('Logistic Regression Path 2D, 2-Classes') pl.axis('tight') ########################################################################## ############## Dataset B ############################# ########################################################################## X = j[0] y = j[1] X = X[y != 2] y = y[y != 2] X -= np.mean(X, 0) cs = l1_min_c(X, y, loss='log') * np.logspace(0, 3) print "Computing regularization path 2D, 4 classes..." 
start = datetime.now() clf = LogisticRegression(C=1.0, penalty='l1', tol=1e-6) coefs_ = [] for c in cs: clf.set_params(C=c) clf.fit(X, y) coefs_.append(clf.coef_.ravel().copy()) print "This took ", datetime.now() - start pl.figure() coefs_ = np.array(coefs_) pl.plot(np.log10(cs), coefs_) ymin, ymax = pl.ylim() pl.xlabel('log(C)') pl.ylabel('Coefficients') pl.title('Logistic Regression Path 2D, 4 classes') pl.axis('tight') ########################################################################## ############## Dataset C ############################# ########################################################################## ''''X = k[0]