def get_model(basis, response, time_series_cv, splits):
    """Fit a cross-validated L1-penalised linear model and compute its path.

    Args:
        basis: Feature matrix handed to ``ElasticNetCV.fit``.
        response: Target values handed to ``ElasticNetCV.fit``.
        time_series_cv: When truthy, folds come from ``TimeSeriesSplit``;
            otherwise from ``KFold``.
        splits: Number of cross-validation splits.

    Returns:
        Tuple ``(model, coefs, mse_path)`` where ``model`` is the fitted
        ``ElasticNetCV``, ``coefs`` is the coefficient array returned by
        ``model.path`` on the fitted alpha grid, and ``mse_path`` is
        ``model.mse_path_``.
    """
    splitter = TimeSeriesSplit if time_series_cv else KFold
    cv = splitter(n_splits=splits)

    model = ElasticNetCV(l1_ratio=1, selection='random', cv=cv)

    # Fitting can be noisy (e.g. convergence warnings); keep them quiet.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        model.fit(basis, response)

    # The keyword must be spelled ``l1_ratio``. A misspelled name lands in
    # ``path``'s extra keyword arguments, so it is not applied as the mixing
    # ratio (it is either ignored or rejected, depending on the
    # scikit-learn version).
    _, coefs, _ = model.path(
        basis,
        response,
        l1_ratio=model.l1_ratio_,
        alphas=model.alphas_,
    )
    return model, coefs, model.mse_path_
def _build_linear_model(self, basis, y):
    """Fit the cross-validated linear model used on the basis matrix.

    Reads ``self.time_series_cv``, ``self.splits``, ``self.seed`` and
    ``self.method`` to configure the cross-validation splitter and the
    estimator.

    Args:
        basis: Feature matrix handed to the estimator's ``fit``.
        y: Target values handed to the estimator's ``fit``.

    Returns:
        Tuple ``(model, coefs, mse_path)``: the fitted estimator, the
        coefficient array returned by ``model.path`` on the fitted alpha
        grid, and ``model.mse_path_``.
    """
    if self.time_series_cv:
        cv = TimeSeriesSplit(n_splits=self.splits)
    else:
        cv = KFold(n_splits=self.splits, random_state=self.seed, shuffle=True)

    if self.method == REGRESSION:
        # model = XGBRegressor(objective='reg:squarederror', booster='gbtree')
        # ``normalize=False`` used to be passed here. It equals the
        # historical default and recent scikit-learn releases no longer
        # accept the argument, so it is left out.
        model = ElasticNetCV(
            l1_ratio=0.1,
            selection='random',
            cv=cv,
            random_state=self.seed,
        )
    else:
        # NOTE(review): the statements after ``fit`` use ``path``,
        # ``alphas_`` and ``mse_path_``, which look specific to the
        # ElasticNetCV branch -- confirm how the non-regression case is
        # meant to be handled (and that an L1 penalty works with the
        # default solver).
        model = LogisticRegressionCV(penalty='l1', cv=cv)

    # Fitting can be noisy (e.g. convergence warnings); keep them quiet.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        model.fit(basis, y)

    # coefs = model.feature_importances_
    # The keyword must be spelled ``l1_ratio``. A misspelled name lands in
    # ``path``'s extra keyword arguments, so it is not applied as the mixing
    # ratio.
    _, coefs, _ = model.path(
        basis,
        y,
        l1_ratio=model.l1_ratio_,
        alphas=model.alphas_,
    )
    return model, coefs, model.mse_path_
def _build_linear_model(self, basis, y):
    """Fit the cross-validated linear model used on the basis matrix.

    Reads ``self.time_series_cv``, ``self.splits``, ``self.seed`` and
    ``self.method`` to configure the cross-validation splitter and the
    estimator.

    Args:
        basis: Feature matrix handed to the estimator's ``fit``.
        y: Target values handed to the estimator's ``fit``.

    Returns:
        Tuple ``(model, coefs, mse_path)``: the fitted estimator, the
        coefficient array returned by ``model.path`` on the fitted alpha
        grid, and ``model.mse_path_``.
    """
    if self.time_series_cv:
        cv = TimeSeriesSplit(n_splits=self.splits)
    else:
        # ``random_state`` only matters when ``shuffle=True``; without
        # shuffling it has no effect and newer scikit-learn rejects the
        # combination. The folds here are unshuffled, so it is omitted.
        cv = KFold(n_splits=self.splits)

    if self.method == REGRESSION:
        # ``normalize=False`` used to be passed here. It equals the
        # historical default and recent scikit-learn releases no longer
        # accept the argument, so it is left out.
        model = ElasticNetCV(
            l1_ratio=1,
            selection='random',
            cv=cv,
            random_state=self.seed,
        )
    else:
        # NOTE(review): the statements after ``fit`` use ``path``,
        # ``alphas_`` and ``mse_path_``, which look specific to the
        # ElasticNetCV branch -- confirm how the non-regression case is
        # meant to be handled (and that an L1 penalty works with the
        # default solver).
        model = LogisticRegressionCV(penalty='l1', cv=cv)

    # Fitting can be noisy (e.g. convergence warnings); keep them quiet.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        model.fit(basis, y)

    # The keyword must be spelled ``l1_ratio``. A misspelled name lands in
    # ``path``'s extra keyword arguments, so it is not applied as the mixing
    # ratio.
    _, coefs, _ = model.path(
        basis,
        y,
        l1_ratio=model.l1_ratio_,
        alphas=model.alphas_,
    )
    return model, coefs, model.mse_path_
def eNetModel(data, labels, featureNames, texts, documents, nFolds):
    """Cross-validate an ElasticNetCV model and average its path coefficients.

    For every fold an ``ElasticNetCV`` is fitted on the training rows, the
    coefficient path returned by ``enet.path`` is averaged across alphas,
    and the held-out predictions are thresholded at 0.5 against 0/1 labels.

    Args:
        data: Indexable feature matrix (indexed with the fold index arrays).
        labels: Indexable 0/1 label array aligned with ``data``.
        featureNames: Unused by the current implementation.
        texts: Only its length is used, to size the ``KFold`` splitter.
        documents: Unused by the current implementation.
        nFolds: Number of folds handed to ``KFold``.

    Returns:
        Tuple ``(accuracy, mean_coefs)``: ``accuracy`` is a ``Decimal`` equal
        to the number of correctly thresholded held-out samples divided by
        ``len(data)``; ``mean_coefs`` is the per-feature mean, over folds, of
        the per-fold path-averaged coefficients.
    """
    # NOTE(review): this is the legacy ``KFold(n, n_folds=...)`` call style,
    # where the splitter itself is iterated -- confirm which KFold the file
    # imports before modernising.
    kf = KFold(len(texts), n_folds=nFolds)
    acc = 0
    mean_coefs = []

    for train, test in kf:
        data_train, data_test = data[train], data[test]
        label_train, label_test = labels[train], labels[test]

        enet = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
                            n_alphas=1000,
                            alphas=[0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.])
        enet.fit(data_train, label_train)

        data_train = np.asarray(data_train, dtype=float)
        label_train = np.asarray(label_train, dtype=float)
        # NOTE(review): ``path`` is called without ``l1_ratio``/``alphas``,
        # so it does not reuse the values selected by ``fit`` -- confirm
        # that averaging this default path is what is wanted.
        vals = enet.path(data_train, label_train)
        mean_coefs.append(np.mean(vals[1], axis=1))

        # Predict once and count hits element-wise. This matches the old
        # single-sample check and also works when a fold holds several
        # samples (where ``and`` on arrays would raise).
        predictions = np.asarray(enet.predict(data_test))
        truth = np.asarray(label_test)
        hits = ((truth == 1) & (predictions > 0.5)) | \
               ((truth == 0) & (predictions < 0.5))
        acc += int(np.sum(hits))

        if len(mean_coefs) % 10 == 0:
            # Single-argument print: same output on Python 2 and Python 3.
            print('%s out of %s subs finished' % (len(mean_coefs), len(data)))

    mean_coefs = np.mean(np.array(mean_coefs), axis=0)
    return Decimal(acc) / Decimal(len(data)), mean_coefs
# Expression rows restricted to the transcription factors of interest.
expr_TF = expr_all.loc[only_TF_list]

# Form a dataframe of gene x TF for pvalue_gt. This DF will be row-sorted
# depending on the cancer.
# A gene x TF matrix, pre-filled with -1.
pvalue_gt_array = np.full((len(only_gene_list), len(only_TF_list)), -1.0)

X_features = expr_TF.values.T

# NOTE(review): ``time.clock`` no longer exists on current Python versions;
# switch together with whatever later reads ``start_time``.
start_time = time.clock()

for i in range(len(only_gene_list)):
    print('Pvalue_gene_TF', i)
    y = expr_gene.iloc[i].values

    # NOTE(review): ``path`` below is called without ``l1_ratio``, so the
    # ``l1_rat`` given to the constructor does not appear to reach it --
    # confirm this is intended.
    EN_model = ElasticNetCV(l1_ratio=l1_rat)

    # Make sure that the number of nonzero coefs does not exceed
    # max_num_coefs: compute a short path and count nonzeros per alpha.
    alphas1, coefs1, _ = EN_model.path(X_features, y, eps=0.01, n_alphas=10)
    num_coefs = np.count_nonzero(coefs1, axis=0)

    rep_EN = 0
    if num_coefs[-1] < max_num_coefs:
        # Even the last alpha on the path stays under the cap: take it.
        EN_coef = coefs1[:, -1]
        selected_ind = np.arange(len(only_TF_list))[EN_coef != 0]
    else:
        # Narrow the alpha range between the last alpha within the cap and
        # the first alpha above it, for at most 10 rounds.
        # NOTE(review): as written here the loop body never recomputes
        # ``num_coefs``/``alphas1``, so only the ``rep_EN`` cap ends it --
        # confirm the refinement step that consumes ``alphas3`` follows.
        while (
            num_coefs[0] != num_coefs[-1]
            and max(num_coefs[num_coefs <= max_num_coefs]) != max_num_coefs
            and rep_EN < 10
        ):
            rep_EN += 1
            alpha_min = alphas1[num_coefs <= max_num_coefs][-1]
            alpha_max = alphas1[num_coefs > max_num_coefs][0]
            alphas3 = np.linspace(alpha_min, alpha_max, 10)
# Fit a cross-validated elastic net and score it on the held-out split.
# ``normalize=False`` used to be passed here. It equals the historical
# default and recent scikit-learn releases no longer accept the argument,
# so it is left out.
en_cv = ElasticNetCV(fit_intercept=True, n_alphas=100, l1_ratio=0.01)
en_cv.fit(x_train, y_train)
p_en_cv = en_cv.predict(x_test)

print("#--------------------------------------")
print("ElasticNetCV regression result")
print("r2_score = %.4f" % (r2_score(y_test, p_en_cv)))
print("mean squared error = %.4f" % (mean_squared_error(y_test, p_en_cv)))
# Number of coefficients that were shrunk exactly to zero.
print(sum(en_cv.coef_ == 0))
print("#--------------------------------------")
print("\n")

# ``path`` does not inherit the estimator's settings. Pass the fitted mixing
# ratio explicitly so the plotted path matches the model above rather than
# the function's default ratio.
enp = en_cv.path(x_train, y_train, l1_ratio=en_cv.l1_ratio_)

# Coefficient trajectories against log(alpha): enp[0] holds the alphas and
# enp[1] the coefficients.
enp_fig = plt.figure()
enp_plot = enp_fig.add_subplot(1, 1, 1)
enp_plot.plot(np.log(enp[0]), enp[1].T)
enp_plot.set_xlabel("lambda vale (log scale)")
enp_plot.set_ylabel("Coefficient estimate value")
enp_plot.set_title("EN solution path")
plt.savefig("en_path.jpg", dpi=300)
plt.show()

# Update results
res_df.loc[2, :] = [mean_squared_error(y_test, p_lasso_cv), 'lasso']
res_df.loc[3, :] = [mean_squared_error(y_test, p_en_cv), 'ElasticNetCV']
print("#--------------------------------------")
print("Update results")