def regression_NumMosquitos(Xtr, ytr, Xte):
    from sklearn.linear_model import ElasticNetCV
    #model_nm = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], max_iter=10000, cv=4)
    model_nm = ElasticNetCV()
    model_nm.fit(Xtr, ytr)
    results_nm = model_nm.predict(Xte)
    return results_nm

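A minimal usage sketch, assuming synthetic NumPy arrays in place of the project's real train/test matrices (Xtr, ytr and Xte below are hypothetical placeholders, not the original data):

import numpy as np

# Hypothetical stand-in data; the real project builds Xtr, ytr, Xte elsewhere.
rng = np.random.RandomState(0)
Xtr = rng.rand(200, 10)
ytr = Xtr @ rng.rand(10) + 0.1 * rng.randn(200)
Xte = rng.rand(50, 10)

preds = regression_NumMosquitos(Xtr, ytr, Xte)
print(preds.shape)  # (50,)
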
def learn_for(reviews, i):
    reg = ElasticNetCV(fit_intercept=True, alphas=[
        0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.])
    u = reviews[i]
    us = range(reviews.shape[0])
    del us[i]
    ps, = np.where(u.toarray().ravel() > 0)
    x = reviews[us][:, ps].T
    y = u.data
    kf = KFold(len(y), n_folds=4)
    predictions = np.zeros(len(ps))
    for train, test in kf:
        xc = x[train].copy().toarray()
        x1 = np.array([xi[xi > 0].mean() for xi in xc])
        x1 = np.nan_to_num(x1)
        for i in xrange(xc.shape[0]):
            xc[i] -= (xc[i] > 0) * x1[i]
        reg.fit(xc, y[train] - x1)
        xc = x[test].copy().toarray()
        x1 = np.array([xi[xi > 0].mean() for xi in xc])
        x1 = np.nan_to_num(x1)
        for i in xrange(xc.shape[0]):
            xc[i] -= (xc[i] > 0) * x1[i]
        p = np.array(map(reg.predict, xc)).ravel()
        predictions[test] = p
    return predictions

def learn_for(self, i):
    reviews = AbstractEstimateBase.reviews
    reg = ElasticNetCV(fit_intercept=True, alphas=[
        0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.])
    nusers, nmovies = reviews.shape
    u = reviews[i]
    us = np.arange(reviews.shape[0])
    us = np.delete(us, i)
    ps, = np.where(u.ravel() > 0)
    x = reviews[us][:, ps].T
    kf = KFold(len(ps), n_folds=4)
    predictions = np.zeros(len(ps))
    for train, test in kf:
        xc = x[train].copy()
        x1 = np.array([xi[xi > 0].mean() for xi in xc])
        x1 = np.nan_to_num(x1)
        for i in range(xc.shape[0]):
            xc[i] -= (xc[i] > 0) * x1[i]
        reg.fit(xc, u[train] - x1)
        xc = x[test].copy()
        x1 = np.array([xi[xi > 0].mean() for xi in xc])
        x1 = np.nan_to_num(x1)
        for i in range(xc.shape[0]):
            xc[i] -= (xc[i] > 0) * x1[i]
        p = reg.predict(xc).ravel()
        predictions[test] = p
    fill_preds = np.zeros(nmovies)
    fill_preds[ps] = predictions
    return fill_preds

def enetCV():
    print ("Doing elastic net")
    cross_val = cross_validation.ShuffleSplit(len(base_X), n_iter=5, test_size=0.2, random_state=0)
    clf4 = ElasticNetCV(cv=cross_val)
    clf4.fit(base_X, base_Y)
    print ("Score = %f" % clf4.score(base_X, base_Y))
    clf4_pred = clf4.predict(X_test)
    write_to_file("elasticCV.csv", clf4_pred)

def train_model(data, target, n_iter, rate):
    """Bootstraps, trains ElasticNetCV model, selects features, and trains
    final linear regression model. Returns model and selected features.
    """
    coefs = []
    for i in range(n_iter):
        print "bootstrap iter {}".format(i)
        indices = np.random.choice(len(data), size=len(data), replace=True)
        sample_data = data[indices]
        sample_target = target[indices]
        model = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], max_iter=10000, n_jobs=4)
        model.fit(sample_data, sample_target)
        coefs.append(model.coef_)
    coefs = np.vstack(coefs)
    rate_selected = make_rates(coefs)
    selected1 = np.nonzero(rate_selected >= rate)[0]
    selected2 = np.argsort(rate_selected)[-50:]
    selected = selected1 if len(selected1) < len(selected2) else selected2
    model = LinearRegression()
    model.fit(data[:, selected], target)
    model_full = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], max_iter=10000, n_jobs=4)
    model_full.fit(data, target)
    return model_full, model, selected, coefs

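The helper make_rates is not shown above. A plausible reading is that it measures how often each coefficient survives the bootstrap; a minimal sketch under that assumption (the name and behaviour are guesses, not the original implementation):

import numpy as np

def make_rates(coefs, tol=1e-8):
    # coefs: (n_bootstrap, n_features) array of ElasticNetCV coefficients.
    # Returns, per feature, the fraction of bootstrap fits with a non-zero coefficient.
    return np.mean(np.abs(coefs) > tol, axis=0)
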
def elastic_net_cv(self, drug_name, l1_ratio=0.5, alphas=None, n_folds=10):
    # Get the data for the requested drug
    xscaled, Y = self._get_one_drug_data(drug_name)
    en = ElasticNetCV(l1_ratio=l1_ratio, alphas=alphas, cv=n_folds)
    encv = en.fit(xscaled, Y)
    self.encv = encv
    print("Best alpha on %s folds : %s" % (n_folds, encv.alpha_))
    #df.sort_values().plot(kind='bar')
    return encv.alpha_

def elasticNet(argv):
    data = pd.read_csv(argv, index_col=0)
    y = data['target']
    X = data.drop('target', axis=1)
    featureNames = X.columns.values
    enet = ElasticNetCV(n_jobs=-1, normalize=True)
    enet.fit(X, y)
    dropIdx = featureNames[enet.coef_ < 1e-5]
    print "Elastic Net drop: %d" % len(dropIdx)
    print dropIdx
    data.drop(dropIdx, axis=1, inplace=True)
    data.to_csv(argv + '.enet.csv')
    return enet

def run(self):
    allcomments = self._aggregateComments(self.data)
    self._buildDictionary(allcomments)
    # create representation of documents
    tfidfArray = self.vectorizer.transform(allcomments)
    # create labelling
    labels = []
    for datum in self.data:
        labels.append(len(datum.meta()['favorites']))
    labels = np.array(labels)
    print self.vectorizer.get_params()
    print self.vectorizer.get_feature_names()
    # training
    self.elasticNet = ElasticNetCV(alphas=self._alpha, l1_ratio=self._l1_ratio,
                                   fit_intercept=True, normalize=False, precompute='auto',
                                   max_iter=1000, copy_X=True, tol=0.0001, rho=None,
                                   cv=self._n_folds)
    self.elasticNet.fit(tfidfArray, labels)
    for i, l1_ratio in enumerate(self._l1_ratio):
        for j, alpha in enumerate(self._alpha):
            print "alpha: %f, l1_ratio: %f --> %f" % (alpha, l1_ratio, np.mean(self.elasticNet.mse_path_[i, j, :]))
    print self.vectorizer.inverse_transform(self.elasticNet.coef_)

def LCCB_coevo(fitness_fn, pop):
    y = fitness_fn.train_y
    # Make a new array composed of pop[i].semantics for all i
    # (pop[i].semantics has already been calculated)
    X = None
    for ind in pop:
        if (ind.phenotype and ind.fitness != sys.maxint
                and all(np.isfinite(ind.semantics))):
            col = ind.semantics
        else:
            print("Omitting a column")
            col = np.zeros(len(y))
        if X is None:
            X = col
        else:
            X = np.c_[X, col]
    eps = 5e-3
    # FIXME FFX processes the data so that has zero mean and unit
    # variance before applying the LR... should we do that?
    # Use ElasticNet with cross-validation, which will automatically
    # get a good value for regularisation
    model = ElasticNetCV()
    model.fit(X, y)
    coefs = model.coef_
    output = model.predict(X)
    rmse = fitness_fn.rmse(y, output)
    print("rmse", rmse)
    # Assign the magnitude of coefficients as individual fitness
    # values. Have to construct a new individual because tuples are
    # immutable. FIXME this is not a great method -- it's likely that
    # the population will converge on one or a few basis functions,
    # and then the performance of the ENet will decrease because there
    # won't be enough independent basis functions to work with.
    pop = [variga.Individual(genome=pop[i].genome,
                             used_codons=pop[i].used_codons,
                             fitness=-abs(coefs[i]),
                             phenotype=pop[i].phenotype,
                             readable_phenotype=pop[i].readable_phenotype,
                             semantics=pop[i].semantics)
           for i in range(len(pop))]
    pop.sort(key=variga.ind_compare)

def predict(train):
    binary = (train > 0)
    reg = ElasticNetCV(fit_intercept=True, alphas=[
        0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.])
    norm = NormalizePositive()
    train = norm.fit_transform(train)
    filled = train.copy()
    # iterate over all users
    for u in range(train.shape[0]):
        # remove the current user for training
        curtrain = np.delete(train, u, axis=0)
        bu = binary[u]
        if np.sum(bu) > 5:
            reg.fit(curtrain[:, bu].T, train[u, bu])
            # Fill the values that were not there already
            filled[u, ~bu] = reg.predict(curtrain[:, ~bu].T)
    return norm.inverse_transform(filled)

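NormalizePositive is defined elsewhere in that project. A minimal sketch of what such a transformer could look like, as an assumption rather than the original class: it centres each user row by the mean of its positive (observed) entries and can add that mean back afterwards.

import numpy as np

class NormalizePositive(object):
    # Hypothetical re-implementation for illustration only.
    def fit(self, features):
        binary = (features > 0)
        count = binary.sum(axis=1)
        count[count == 0] = 1  # avoid division by zero on empty rows
        self.mean = (features * binary).sum(axis=1) / count
        return self

    def fit_transform(self, features):
        return self.fit(features).transform(features)

    def transform(self, features):
        # subtract each row's positive-entry mean from its observed entries
        binary = (features > 0)
        return features - self.mean[:, None] * binary

    def inverse_transform(self, features):
        # add the stored row means back to every entry (including predictions)
        return features + self.mean[:, None]
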
def regress(x, y, title):
    clf = ElasticNetCV(max_iter=200, cv=10, l1_ratio=[.1, .5, .7, .9, .95, .99, 1])
    clf.fit(x, y)
    print "Score", clf.score(x, y)
    pred = clf.predict(x)
    plt.title("Scatter plot of prediction and " + title)
    plt.xlabel("Prediction")
    plt.ylabel("Target")
    plt.scatter(y, pred)
    # Show perfect fit line
    if "Boston" in title:
        plt.plot(y, y, label="Perfect Fit")
        plt.legend()
    plt.grid(True)
    plt.show()

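One possible way to call regress, assuming scikit-learn's bundled diabetes data as a stand-in for whatever dataset the original script passed in (only the "Boston" title check comes from the snippet above):

from sklearn.datasets import load_diabetes

# Hypothetical call with a publicly available regression dataset.
X, y = load_diabetes(return_X_y=True)
regress(X, y, "Diabetes target")
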
def enet_granger_causality_cv(X_t, y_t, cv, alphas, top_num=None, top_perc=4, max_iter=100, lambdas=None):
    # alpha is the l1_ratio
    if lambdas != None:
        use_lambdas = np.tile(lambdas, len(alphas)).reshape(len(alphas), len(lambdas))
        enet = ElasticNetCV(l1_ratio=alphas, alphas=use_lambdas, cv=cv, max_iter=max_iter)
        fit = enet.fit(X_t, y_t)
        use_lambdas = fit.alphas_
        use_lambdas = np.tile(use_lambdas, len(alphas)).reshape(len(alphas), len(lambdas))
        print "Used lambdas"
        print use_lambdas
    else:
        enet = ElasticNetCV(l1_ratio=alphas, cv=cv, max_iter=max_iter)
        fit = enet.fit(X_t, y_t)
        use_lambdas = fit.alphas_  # lambdas is a matrix
    cv_mses = enet.mse_path_.sum(axis=2).flatten()
    cv_alphas = np.repeat(alphas, use_lambdas.shape[1])
    cv_lambdas = use_lambdas.flatten()
    if top_num == None:
        print "Num cv alphas: ", len(cv_alphas)
        top_num = int(len(cv_alphas) * top_perc / 100.0)
        print "Top num ", top_num
    # this will keep the smallest
    top_indices, top_mses = get_min_k(cv_mses, top_num)
    top_lambdas = cv_lambdas[top_indices]
    top_alphas = cv_alphas[top_indices]
    top_df = pd.DataFrame(data={"lambda.min": top_lambdas, "alpha": top_alphas, "error.min": top_mses})
    return top_df

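The helper get_min_k is not shown here. A plausible stand-in, assumed rather than taken from the original code, simply returns the indices and values of the k smallest CV errors:

import numpy as np

def get_min_k(values, k):
    # Hypothetical helper: indices and values of the k smallest entries.
    idx = np.argsort(values)[:k]
    return idx, values[idx]
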
def elasticNetRegNT(self, X, Y, nCV, l1_weights=None):
    """Run elastic net with the given params
    :param X: design matrix
    :param Y: true labels
    :param nCV: number of CVs
    :param l1_weights: weights of the lasso term
    :return:
    """
    # very difficult to choose alpha, better use CV
    # enet = ElasticNet(alpha=self.alpha, l1_ratio=0.8, fit_intercept=False)
    # enet = ElasticNetCV(fit_intercept=False, cv=nCV)
    if (self.useCV):
        enet = ElasticNetCV(cv=nCV, max_iter=self.maxItr, l1_weights=l1_weights,
                            fit_intercept=self.fit_intercept, alphas=self.alphas,
                            l1_ratio=self.l1_ratio)
        enet.fit(X, Y)
        self.cv_alpha = enet.alpha_
    else:
        enet = ElasticNet(alpha=self.alpha, l1_ratio=self.l1_ratio, max_iter=self.maxItr,
                          l1_weights=l1_weights)
        enet.fit(X, Y)
    if self.verbose:
        print("Num of iter: %d" % enet.n_iter_)
        # print("Best alpha: {}, l1_ratio: {}"
        #       .format(enet.alpha_, enet.l1_ratio_))
        # print(enet.get_params())
    ## plot regulation path for testing
    # testReg.lassoElasticnetPaths(X, Y)
    return enet.coef_, enet.intercept_

def eNetModel(data, labels, featureNames, texts, documents, nFolds):
    # run SVM with grid search for parameters and leave-one-out cross validation
    kf = KFold(len(texts), n_folds=nFolds)
    acc = 0
    mean_coefs = []
    for train, test in kf:
        # test_docs = {}
        label_train = labels[train]
        #selected_feats = getSelectedFeatures(train, test, texts, featureNames, documents, label_train, nFeats)
        full_train_data, full_test_data, label_train, label_test = data[train], data[test], labels[train], labels[test]
        #data_train = sortBySelected(full_train_data, selected_feats, featureNames)
        #data_test = sortBySelected(full_test_data, selected_feats, featureNames)
        data_train = full_train_data
        data_test = full_test_data
        enet = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], n_alphas=1000,
                            alphas=[0.0125, 0.025, 0.05, .125, .25, .5, 1., 2., 4.])
        enet.fit(data_train, label_train)
        data_train = np.asarray(data_train, dtype=float)
        label_train = np.asarray(label_train, dtype=float)
        vals = enet.path(data_train, label_train)
        mean_coefs.append(np.mean(vals[1], axis=1))
        if label_test == 1 and enet.predict(data_test) > 0.5:
            acc += 1
        elif label_test == 0 and enet.predict(data_test) < 0.5:
            acc += 1
        if len(mean_coefs) % 10 == 0:
            print str(len(mean_coefs)), 'out of %s subs finished' % (str(len(data)))
    mean_coefs = np.mean(np.array(mean_coefs), axis=0)
    return Decimal(acc) / Decimal(len(data)), mean_coefs

# Scale our Data with Robust Scaler to minimise outlier influence
# (Approx 4% of the data are significant outliers as measured by Cook's Distance)
rb_scaler = RobustScaler()
X_scaled = pd.DataFrame(rb_scaler.fit_transform(X), columns=X.columns)

std_scaler = StandardScaler()
X_standard = pd.DataFrame(std_scaler.fit_transform(X), columns=X.columns)

## Define CV Root Mean Square Error ##
def cv_rmse(estimator, X, y, cv=5):
    rmse = np.mean(np.sqrt(-cross_val_score(estimator, X, y, cv=cv, scoring="neg_mean_squared_error")))
    return rmse

## Regression Models ##
# Elastic Net Regressor
elastic_reg = ElasticNetCV(cv=5, max_iter=15000)

# Lasso Model for Comparison
lasso_reg = LassoCV(cv=5, alphas=[0.011], max_iter=15000)  # Previously Optimised

## Model Evaluation & Hyperparameter Tuning ##
# CV Root Mean Squared Error on Training Set (Robust Scaled)
cv_rmse(lasso_reg, X_scaled, np.ravel(y))    # LASSO: 0.319
cv_rmse(elastic_reg, X_scaled, np.ravel(y))  # Elastic Net (ratio = 0.5): 0.317

# CV Root Mean Squared Error on Training Set (Standardised)
cv_rmse(lasso_reg, X_standard, np.ravel(y))    # LASSO: 0.2992
cv_rmse(elastic_reg, X_standard, np.ravel(y))  # Elastic Net (ratio = 0.5): 0.3012

# Alpha Selection
alphas = np.logspace(-10, 1, 400)

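The snippet stops at building the alpha grid. One plausible continuation, an assumption rather than the original code, is to hand that grid to ElasticNetCV and read off the selected penalty:

# Hypothetical continuation: search the alpha grid with 5-fold CV on the standardised data.
elastic_alpha_search = ElasticNetCV(alphas=alphas, cv=5, max_iter=15000)
elastic_alpha_search.fit(X_standard, np.ravel(y))
print(elastic_alpha_search.alpha_)  # penalty strength chosen by cross-validation
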
l = []
with h5py.File("ECoG_big_data.h5", "r+") as f1:
    with h5py.File("selected.h5", "r+") as f2:
        for i in range(1, 4):
            sid = "sub" + str(i)
            X = f1[sid]["train_data"][:]
            Y = f1[sid]["train_clabel"][:]
            Yb = f1[sid]["train_blabel"][:]
            Xt = f1[sid]["test_data"][:]
            Yt = f1[sid]["test_clabel"][:]
            Ytb = f1[sid]["test_blabel"][:]
            for finger in range(5):
                for method in ["l1", "mcp", "scad"]:
                    idxc = f2[sid]["finger" + str(finger + 1)][method][:] - 1
                    idxb = f2[sid]["finger" + str(finger + 1)]["l1_l"][:] - 1
                    en = ElasticNetCV()
                    en.fit(X[:, idxc].astype("float64"), Y[:, finger])
                    yp = en.predict(Xt[:, idxc])
                    corr = np.corrcoef(yp, Yt[:, finger])[0, 1]
                    if corr < 0.3:
                        break
                    else:
                        l.append([sid + "//" + "finger" + str(finger + 1), corr])
                        lr = LogisticRegressionCV()
                        lr.fit(X[:, idxc], Yb[:, finger])
                        tp = yp * fun(lr.predict(Xt[:, idxc]))
                        m = np.where(np.convolve(tp, np.ones((40,)) / 40, mode="same") < 0.5, 0, 1)
                        b, a = butter(2, 9.0 / 25, "low")
                        yy = relu(filtfilt(b, a, tp * m))
                        print corr, np.corrcoef(Yt[:, finger], yy)[0, 1]

def main():
    path = '../data/states'
    data = pd.read_csv(f'{path}/state_mean_accident_data.csv', header=0, index_col='state_name')
    non_feature_cols = [
        'state_number', 'state_code', 'accidents', 'fatalities',
        'fatalities_per_accident', 'accidents_per_100k', 'num_vehicles',
        'hour_of_day', 'num_fatalities', 'num_drunk_drivers'
    ]
    labels = data['accidents_per_100k']
    features = data.drop(non_feature_cols, axis=1)
    feature_names = features.columns

    scores_df = features.corrwith(labels, axis=0, method='pearson').to_frame('r_coef')
    scores_df['f_score'], scores_df['p_value'] = fs.f_regression(features, labels)
    # Sort by p value
    scores_df.sort_values('p_value', inplace=True)
    print(scores_df)

    # feature_subset_cols = scores_df[scores_df['p_value'] < 0.05].index.tolist()
    # features = features[feature_subset_cols]

    scaler = StandardScaler()
    features = scaler.fit_transform(features)

    # X_train, X_test, y_train, y_test = train_test_split(features, labels,
    #                                                     test_size=0.2, random_state=2020)

    models = {
        'Linear Regression': (LinearRegression(), 'linreg'),
        'Ridge': (RidgeCV(alphas=[0.01, 0.1, 1.0, 10.0], cv=5,
                          scoring='neg_mean_squared_error'), 'ridge'),
        'Elastic Net': (ElasticNetCV(l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1.0],
                                     alphas=[0.01, 0.1, 1.0, 10.0], max_iter=3000, cv=5),
                        'elastic_net'),
        'Linear SVR': (LinearSVR(), 'svr')
    }

    for name, (model, suffix) in models.items():
        print(name)
        print('-' * 20)
        model.fit(features, labels)
        y_pred = model.predict(features)
        utils.print_regression_metrics(labels, y_pred)
        utils.hist_resids(labels, y_pred, name, suffix)
        utils.resid_qq(labels, y_pred, name, suffix)
        utils.resid_plot(labels, y_pred, name, suffix)
        utils.feature_importance_regression(model, feature_names, name, suffix)
        utils.permutation_importances(model, features, labels, feature_names, name, suffix)
        print('#' * 50)

#Lasso
lasso = Lasso(alpha=0.0005, random_state=1)
print("Lasso score: {:.4f} \n".format(cv_rmse(lasso).mean()))

alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5, 10, 5]
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]

#Ridge
ridge = RidgeCV(alphas=alphas_alt, cv=kfolds)
print("Ridge score: {:.4f} \n".format(cv_rmse(ridge).mean()))

#ElasticNet
elasticnet = ElasticNetCV(cv=kfolds, alphas=e_alphas)
print("ElasticNet score: {:.4f} \n".format(cv_rmse(elasticnet).mean()))

#Svr
#svr = SVR()
#print(cv_rmse(svr).mean())

#XGBoost
xgboost = XGBRegressor(learning_rate=0.01, n_estimators=3460,
                       max_depth=3, min_child_weight=0,
                       gamma=0, subsample=0.7,
                       colsample_bytree=0.7,
                       objective='reg:linear',

features = features.dropna(axis=1)

alpha_values = []
for a in range(1, 10001):
    alpha_values.append(a / 100.0)

print "Started at " + str(datetime.now())

estimator_ridge = RidgeCV(alphas=alpha_values, cv=3)
estimator_ridge.fit(features, goal)
scores = cross_val_score(Ridge(alpha=estimator_ridge.alpha_), features, goal, cv=5)
print "Ridge alpha " + str(estimator_ridge.alpha_)
print str(np.mean(scores))
print scores

estimator_lasso = LassoCV(alphas=alpha_values, cv=3)
estimator_lasso.fit(features, goal)
scores = cross_val_score(Lasso(alpha=estimator_lasso.alpha_), features, goal, cv=5)
print "Lasso alpha " + str(estimator_lasso.alpha_)
print str(np.mean(scores))
print scores

estimator_elastic_net = ElasticNetCV(alphas=alpha_values, cv=3, n_jobs=-1)
estimator_elastic_net.fit(features, goal)
scores = cross_val_score(ElasticNet(alpha=estimator_elastic_net.alpha_), features, goal, cv=5)
print "ElasticNet alpha " + str(estimator_elastic_net.alpha_)
print str(np.mean(scores))
print scores

print "Finished at " + str(datetime.now())

seed = sys.argv[1]
p = sys.argv[2]
corr = ('corr' if sys.argv[3] == 'TRUE' else '')
np.random.seed(int(seed))

Data = np.loadtxt('Data_' + p + 'p_' + corr + seed + '.csv', delimiter=',')
y = Data[:, 0][0:100]
X = Data[:, 1:][0:100, :]
y_test = Data[:, 0][100:200]
X_test = Data[:, 1:][100:200, :]

#----------------------------------------------------------------------------------------------------------------
# Elastic net with cross-validation over lambda and alpha.
#----------------------------------------------------------------------------------------------------------------
Output = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], cv=10, max_iter=100000)
Output.fit(X, y)

# MAP model.
m = (abs(Output.coef_) > 0)

# Covariates in the MAP model.
covariates = np.where(m == True)[0]

# Root mean squared error of the MAP model on the test set.
rmse = np.mean((y_test - X_test @ Output.coef_)**2)**.5

postProb = -99

items = pd.Series([covariates, postProb, rmse],

# Now, we use 5 fold cross-validation to estimate generalization error
kf = KFold(len(x), n_folds=5)
p = np.zeros_like(y)
for train, test in kf:
    met.fit(x[train], y[train])
    p[test] = met.predict(x[test])

r2_cv = r2_score(y, p)
print('Method: {}'.format(name))
print('R2 on training: {}'.format(r2_train))
print('R2 on 5-fold CV: {}'.format(r2_cv))
print()

# Construct an ElasticNetCV object (use all available CPUs)
met = ElasticNetCV(n_jobs=-1, l1_ratio=[.01, .05, .25, .5, .75, .95, .99])

kf = KFold(len(x), n_folds=5)
pred = np.zeros_like(y)
for train, test in kf:
    met.fit(x[train], y[train])
    pred[test] = met.predict(x[test])

print('[EN CV l1_ratio] RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(y, pred))))
print('[EN CV l1_ratio] R2 on testing (5 fold), {:.2}'.format(r2_score(y, pred)))
print('')

'''
# unit version

#X = np.array(tfidf_array)
X = X_uni_bi_gram
y = np.array(engagement_rate)
print X

binary_y_pre = []
for i in range(len(y)):
    if y[i] > 0:
        binary_y_pre.append(1)
    else:
        binary_y_pre.append(0)
binary_y = np.array(binary_y_pre)

coef_path_linear_cv = LinearRegression(normalize=Normalize, fit_intercept=Fit_Intercept)
coef_path_lasso_cv = LassoCV(normalize=Normalize, max_iter=Max_Iter, copy_X=True, cv=CV,
                             verbose=Verbose, fit_intercept=Fit_Intercept, tol=Tol)  #, alphas=Alphas)
coef_path_elastic_cv = ElasticNetCV(normalize=Normalize, max_iter=Max_Iter, tol=Tol)  #, alphas=Alphas)
coef_path_logistic_cv = LogisticRegression(tol=Tol)
coef_path_binary_x_logistic_cv = LogisticRegression(tol=Tol)
coef_path_forest_cv = RandomForestClassifier(n_estimators=N_Estimators, max_features=number_of_features)

binary_X = vectorizer_binary.fit_transform(corpus)

coef_path_forest_cv.fit(X, binary_y)
coef_path_lasso_cv.fit(X, y)
coef_path_binary_x_logistic_cv.fit(binary_X, binary_y)
coef_path_logistic_cv.fit(X, binary_y)
coef_path_elastic_cv.fit(X, y)

forest_cv_score = cross_validation.cross_val_score(coef_path_forest_cv, X, binary_y, n_jobs=2, cv=CV, scoring='roc_auc')
lasso_cv_score = cross_validation.cross_val_score(coef_path_lasso_cv, X, y, n_jobs=2, cv=CV, scoring=Scoring)
elastic_cv_score = cross_validation.cross_val_score(coef_path_elastic_cv, X, y, n_jobs=2, cv=CV, scoring=Scoring)
logistic_cv_score = cross_validation.cross_val_score(coef_path_logistic_cv, X, binary_y, n_jobs=2, cv=CV, scoring='roc_auc')

    min_samples_leaf=5), random_state=13, n_estimators=17), "AdaBoostAuto")
build_auto(ARDRegression(normalize=True), "BayesianARDAuto")
build_auto(BayesianRidge(normalize=True), "BayesianRidgeAuto")
build_auto(DecisionTreeRegressor(random_state=13, min_samples_leaf=2), "DecisionTreeAuto", compact=False)
build_auto(
    BaggingRegressor(DecisionTreeRegressor(random_state=13, min_samples_leaf=5),
                     random_state=13, n_estimators=3, max_features=0.5), "DecisionTreeEnsembleAuto")
build_auto(DummyRegressor(strategy="median"), "DummyAuto")
build_auto(ElasticNetCV(random_state=13), "ElasticNetAuto")
build_auto(ExtraTreesRegressor(random_state=13, min_samples_leaf=5), "ExtraTreesAuto")
build_auto(GradientBoostingRegressor(random_state=13, init=None), "GradientBoostingAuto")
build_auto(HuberRegressor(), "HuberAuto")
build_auto(LarsCV(), "LarsAuto")
build_auto(LassoCV(random_state=13), "LassoAuto")
build_auto(LassoLarsCV(), "LassoLarsAuto")
build_auto(
    OptimalLGBMRegressor(objective="regression", n_estimators=17, num_iteration=11), "LGBMAuto")
build_auto(LinearRegression(), "LinearRegressionAuto")
build_auto(
    BaggingRegressor(LinearRegression(), random_state=13, max_features=0.75),

l1_ratio = 0.7
enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
enet_model = enet.fit(X_train, y_train)
y_pred_enet = enet_model.predict(X_test)
r2_score_enet = r2_score(y_test, y_pred_enet)
print(enet)
print("r^2 on test data : %f" % r2_score_enet)
# r^2 on test data : 0.100723

# plt.plot(enet.coef_, label='Elastic net coefficients')
# plt.plot(coef, '--', label='original coefficients')
# plt.legend(loc='best')
# plt.title("R^2: %f" % (r2_score_enet))
# plt.show()

# set the parameters alpha and l1_ratio by cross-validation
from sklearn.linear_model import ElasticNetCV
enetcv = ElasticNetCV(l1_ratio=[.1, .2, .3, .4, .5, .6, .7, .8, .9])
enetcv_model = enetcv.fit(X_train, y_train)
y_pred_enetcv = enetcv_model.predict(X_test)
r2_score_enetcv = r2_score(y_test, y_pred_enetcv)
print(enetcv)
print("r^2 on test data : %f" % r2_score_enetcv)
# r^2 on test data : 0.22553

assert(r2_score_enetcv > r2_score_enet)

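After the cross-validated fit, the hyperparameters chosen by ElasticNetCV can be read back from the fitted object; a small follow-up sketch (not part of the original snippet):

# Hypothetical inspection of the hyperparameters selected by cross-validation.
print("alpha chosen by CV   : %f" % enetcv_model.alpha_)
print("l1_ratio chosen by CV: %f" % enetcv_model.l1_ratio_)
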
def fit(self, raw_array, aux_data_a_d=None, diff=False, feature_s_l=[],
        holdout_col=0, lag=1, positive_control=False,
        regression_algorithm_s='elastic_net', **kwargs):
    """
    Performs an auto-regression of a given lag on the input array. Axis 0
    indexes observations (schools) and axis 1 indexes years. For
    holdout_col>0, the last holdout_col years of data will be withheld from
    the fitting, which is ideal for training the algorithm.
    """

    # Apply optional parameters
    if holdout_col > 0:
        raw_array = raw_array[:, :-holdout_col]
    if diff:
        array = np.diff(raw_array, 1, axis=1)
    else:
        array = raw_array

    # Create model and fit parameters
    Y = array[:, lag:].reshape(-1)
    X = np.ndarray((Y.shape[0], 0))
    for i in range(lag):
        X = np.concatenate((X, array[:, i:-lag+i].reshape(-1, 1)), axis=1)
    # Y = X_t = A_1 * X_(t-lag) + A_2 * X_(t-lag+1)) + ... + A_lag * X_(t-1) + A_(lag+1)
    if positive_control:
        X = np.concatenate((X, array[:, lag:].reshape(-1, 1)), axis=1)
    if aux_data_a_d:
        for feature_s in feature_s_l:
            if holdout_col > 0:
                raw_array = aux_data_a_d[feature_s][:, :-holdout_col]
            else:
                raw_array = aux_data_a_d[feature_s]
            if diff:
                array = np.diff(raw_array, 1, axis=1)
            else:
                array = raw_array
            for i in range(lag):
                X = np.concatenate((X, array[:, i:-lag+i].reshape(-1, 1)), axis=1)
    estimatorX = Imputer(axis=0)
    X = estimatorX.fit_transform(X)
    estimatorY = Imputer(axis=0)
    Y = estimatorY.fit_transform(Y.reshape(-1, 1)).reshape(-1)

    if regression_algorithm_s == 'elastic_net':
        l1_ratio_l = [.1, .5, .7, .9, .95, .99, 1]
        alpha_l = np.logspace(-15, 5, num=11).tolist()
        max_iter = 1e5  # It's too slow when I make it high, so I'll keep it low for now
        model = ElasticNetCV(l1_ratio=l1_ratio_l, alphas=alpha_l, max_iter=max_iter,
                             fit_intercept=True, normalize=True)
    elif regression_algorithm_s == 'gaussian_process':
        model = GaussianProcess()
        # This currently gives the following error:
        # "Exception: Multiple input features cannot have the same target value."
    elif regression_algorithm_s == 'gradient_boosting':
        model = GradientBoostingRegressor(max_features='sqrt')
    elif regression_algorithm_s == 'linear_regression':
        model = LinearRegression(fit_intercept=True, normalize=True)
    elif regression_algorithm_s == 'random_forest':
        model = RandomForestRegressor(max_features='auto')
    model.fit(X, Y)

    if regression_algorithm_s in ['elastic_net', 'linear_regression']:
        with open(os.path.join(config.plot_path, 'coeff_list.txt'), 'a') as f:
            f.write('Lag of {0:d}:\n'.format(lag))
            # f.write('\nElastic net: R^2 = %0.5f, l1_ratio = %0.2f, alpha = %0.1g' %
            #         (model.score(X, Y), model.l1_ratio_, model.alpha_))
            coeff_t = model.coef_
            assert(not positive_control)  # The coefficients won't currently line up
            for i_lag in range(lag):
                f.write('\ti_lag = {0:d}: {1:0.2g}\n'.format(lag-i_lag, coeff_t[i_lag]))
            for i_feature, feature_s in enumerate(feature_s_l):
                for i_lag in range(lag):
                    f.write('\t{0}:\n\t\ti_lag = {1:d}: {2:0.2g}\n'.format(
                        feature_s, lag-i_lag, coeff_t[lag*(i_feature+1) + i_lag]))

    return model

alphas = np.random.uniform(low=0, high=10, size=(50,))
ridgecv = RidgeCV(alphas=alphas, cv=10, normalize=True)
ridgecv.fit(x_train, y_train)
ridgecv.alpha_

ridge_model = Ridge(alpha=ridgecv.alpha_)
ridge_model.fit(x_train, y_train)
ridge_model.score(x_test, y_test)

# we got the same r2 score using Ridge regression as well. So, it's safe to say there is no overfitting.

# Elastic net
elasticCV = ElasticNetCV(alphas=None, cv=10)
elasticCV.fit(x_train, y_train)
elasticCV.alpha_

# l1_ratio gives how close the model is to L1 regularization; the value below indicates
# we are giving equal preference to L1 and L2
elasticCV.l1_ratio

elasticnet_reg = ElasticNet(alpha=elasticCV.alpha_, l1_ratio=0.5)
elasticnet_reg.fit(x_train, y_train)
elasticnet_reg.score(x_test, y_test)

# So, we can see that by using different types of regularization we still get the same r2 score.
# That means our OLS model has been well trained over the training data and there is no overfitting.

sales1 = sales.reset_index()
brands_new1 = brands_new.reset_index()
sales_merged = sales1.merge(brands_new1, how='left', on=['index', 'Brand_Family', 'SubBrand_Family'])
sales_merged = sales_merged.set_index(['index'])
print('Merged')

# Brands with no recorded attitude were assigned the "average" attitude; this may have degraded the dataset somewhat.
sales_merged.Affinity = sales_merged.Affinity.fillna(sales_merged.Affinity.mean())
sales_merged['Brand Character'] = sales_merged['Brand Character'].fillna(sales_merged['Brand Character'].mean())
sales_merged['Functional Performance'] = sales_merged['Functional Performance'].fillna(sales_merged['Functional Performance'].mean())

# Time was running short here, so I tried to build at least some model, but it came out quite poor.
from sklearn.cross_validation import KFold
from sklearn.linear_model import ElasticNetCV

met = ElasticNetCV()
features = sales_merged[['PMI_Portfolio_AVB_Boost', 'PMI_Portfolio_PFP_Boost', 'PMI_Portfolio_PPRP',
                         'PMI_Portfolio_SA', 'SubFam_Hostess', 'SubFam_PFP_Boost', 'SubFam_RAP',
                         'SubFam_SA', 'Fam_AVB_Boost', 'Fam_Hostess', 'Fam_PFP_Boost', 'Fam_RAP',
                         't', 'Affinity', 'Brand Character', 'Functional Performance']].as_matrix()
target = sales_merged['Volume_Sales'].as_matrix()

met = ElasticNetCV(n_jobs=-1, l1_ratio=[.01, .05, .25, .5, .75, .95, .99])
kf = KFold(len(target), n_folds=5)
pred = np.zeros_like(target)
for train, test in kf:
    met.fit(features[train], target[train])
    pred[test] = met.predict(features[test])

plt.axis('tight')
plt.show()

# RidgeCV
from sklearn.linear_model import RidgeCV
model = RidgeCV(cv=20)
model_ridge = model.fit(ratings_ext_input_sim2[X_features], ratings_ext_input_sim2['rating'])
rating_predicted = model_ridge.predict(ratings_ext_input_sim2[X_features])
error = (rating_predicted - ratings_ext_input_sim2['rating'])
np.mean(error*error)  # 4.77 (0.633 good?)
score = model_ridge.score(ratings_ext_input_sim2[X_features], ratings_ext_input_sim2['rating'])
model_ridge.coef_

# Elastic Net
from sklearn.linear_model import ElasticNetCV
enet = ElasticNetCV(l1_ratio=0.5, cv=10)  # 1 for LASSO
model_enet = enet.fit(ratings_ext_input_sim2[X_features], ratings_ext_input_sim2['rating'])
rating_predicted = model_enet.predict(ratings_ext_input_sim2[X_features])
error = (rating_predicted - ratings_ext_input_sim2['rating'])
np.mean(error*error)  # 4.168
# alpha = 1, l1_ratio = 0: very high 4.67
# alpha = 0.1, l1_ratio = 0: very high 4.57
# alpha = 0.5, l1_ratio = 0: very high 4.64
# alpha = 0.7, l1_ratio = 0: very high 4.65

from sklearn.linear_model import lasso_path, enet_path
model_enet.mse_path_

plt.figure(1)
ax = plt.gca()
ax.set_color_cycle(2 * ['b', 'r', 'g', 'c', 'k'])
#l1 = plt.plot(-np.log10(alphas_lasso), coefs_lasso.T)
l1 = plt.plot(-np.log10(model_enet.alphas_), model_enet.coef_, linestyle='--')

import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: -114.8406727584057
exported_pipeline = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    ElasticNetCV(l1_ratio=0.6000000000000001, tol=0.01))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

#times
X_train, X_test, Y_train, Y_test = train_test_split(alldata, newY, test_size=0.3)

#frequencies
X_train, X_test, Y_train, Y_test = train_test_split(allfreqdata, newY, test_size=0.3)

svr = SVR(cache_size=1500)
svr_params = {'C': [1e-2, 1, 1e2], 'epsilon': [1e-3, 1e-2, 1e-1]}

#fit without transforms 0.009
#fit with kld 0.017
#test with newy hier. interc.

#takes looong
enet_cv = ElasticNetCV(l1_ratio=[0.1, 0.3, 0.5, 0.7, 0.9], max_iter=2000)
enet_cv.fit(X_tr_new, Y_train)

rcv = RidgeCV(alphas=[1e-2, 1e-1, 1, 10])
#rcv.fit(X_train,Y_train)

svr_gs = GridSearchCV(svr, svr_params, verbose=1, n_jobs=-1)
#svr_gs.fit(X_train,Y_train)

#%%
#visualization of posterior ERPs averaged over Pbs and epochs
#for chan Fz
posteriors = np.unique(np.round(bc_dict["01"], decimals=2))
avr_ERP_p_post_list = [get_average_ERPs_per_posterior(mat_dict[k], bc_dict[k], chan=4)
                       for k in sorted(mat_dict.keys())]

#### assessing performance of the negative binomial regression model
performance_negativebinomial = []
for x in [0.01, 0.1, 1, 5, 10]:
    cost = []
    for a, b in cross_validation_object:
        resultingmodel = sm.NegativeBinomial(Y[a], X[a], loglike_method='geometric')
        #res = resultingmodel.fit(disp=False, maxiter = 200)
        res2 = resultingmodel.fit_regularized(alpha=x, maxiter=200)
        cost.append(mean_squared_error(res2.predict(X[b]), Y[b]))
    performance_negativebinomial.append(np.mean(cost))

##### Log linear model ##########  not even close.
from sklearn.linear_model import ElasticNetCV
linear_fit = ElasticNetCV(cv=cross_validation_object, alphas=[0.01, 0.1, 1, 5, 10])
linear_fit.fit(X, np.log(Y+1))
mean_squared_error(np.exp(linear_fit.predict(X)) - 1, Y)

########## creating final model using train data + test data
X_test, Y_test, junk = prepare_for_model('Dogs_Final_Test.csv', 1)
X, Y, junk = prepare_for_model('Dogs_Final_Train.csv', 1)

scaler = MinMaxScaler([0, 1])
X_all = scaler.fit_transform(np.vstack((X_test, X)))
Y_all = np.hstack((Y_test, Y))
Y_all = np.array([30 if i > 30 else i for i in Y_all])

final_model = sm.NegativeBinomial(Y_all, X_all, loglike_method='geometric')
res2 = final_model.fit_regularized(alpha=5, maxiter=200)

                          warm_start=True)
if (method == 5):
    print('Random forest 02')
    str_method = 'RandomForest02'
    r = RandomForestRegressor(n_estimators=90,
                              max_depth=4,
                              n_jobs=-1,
                              random_state=ra1,
                              verbose=0,
                              warm_start=True)
if (method == 6):
    print('ElasticNet')
    str_method = 'Elastic Net'
    r = ElasticNetCV()
if (method == 7):
    print('GradientBoosting 01')
    str_method = 'GradientBoosting01'
    r = GradientBoostingRegressor(n_estimators=80,
                                  max_depth=5,
                                  learning_rate=0.05,
                                  random_state=ra1,
                                  verbose=0,
                                  warm_start=True,
                                  subsample=0.6,
                                  max_features=0.6)
if (method == 8):
    print('GradientBoosting 02')
    str_method = 'GradientBoosting02'

X = scaler.transform(X)

#add intercept
X = np.hstack((np.ones(X.shape[0])[:, None], X))

train_X, test_X, train_Y, test_Y = train_test_split(X, y, test_size=0.1)

#%%
#try elastic net
#alpha equals lambda here
lambda_grid = [0.01, 0.1, 1, 10, 100]
l1_ratio_grid = [0.1, 0.3, 0.5, 0.7, 0.9]
enet_CV = ElasticNetCV(l1_ratio=l1_ratio_grid, alphas=lambda_grid, cv=3, n_jobs=-1, verbose=True)
enet_CV.fit(train_X, train_Y)

#%%
#show
enet_CV.score(test_X, test_Y)
plt.plot(enet_CV.predict(test_X), test_Y, 'o')

#%%
#try svr
svr = SVR(kernel='rbf', C=1, cache_size=2000)
SVR_params = {'C': [1e-1, 1.0, 1e2, 1e3, 1e4]}
svr_rs = grid_search.RandomizedSearchCV(svr, SVR_params, verbose=True, n_jobs=-1)

from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model.tests.test_sparse_coordinate_descent import make_sparse_data
from time import time
import pylab as pl
import numpy as np

X, y = make_sparse_data(n_samples=500, n_features=2000, n_informative=200)

n_cores = [1, 2, 4]
n_alpha = [5, 10, 50, 100]
times = [0] * 12
counter = 0

for _ in range(3):
    for core in n_cores:
        for alpha in n_alpha:
            clf = ElasticNetCV(n_jobs=core, n_alphas=alpha, l1_ratio=0.5, cv=10)
            print "core = %d, alpha = %d" % (core, alpha)
            t = time()
            clf.fit(X, y)
            times[counter % 12] += (time() - t)
            print times
            counter += 1

# Got after doing the above. Just for future reference.
core1_mp = [57.457534631093345, 72.31527137756348, 210.2204163869222, 379.9918119907379]
core2_mp = [55.89718206723531, 51.196732918421425, 138.35079900423685, 239.67310031255087]
core3_mp = [42.53018967310587, 49.97517212231954, 122.26631005605061, 204.76643363634744]
core1_t = [60.99967805544535, 75.41305232048035, 219.61244002978006, 390.601344982783]
core2_t = [46.21716833114624, 54.701584259668984, 144.06910300254822, 242.6696043809255]
core3_t = [43.21849703788757, 49.07820804913839, 122.74103697141011, 205.75086871782938]

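A small plotting sketch that is not part of the original benchmark: it assumes the core1_mp/core2_mp/core3_mp lists above are the multiprocessing timings for 1, 2 and 4 cores respectively, and plots them against the number of alphas using the pylab module already imported as pl.

# Hypothetical visualisation of the benchmark numbers recorded above.
for label, series in [("1 core", core1_mp), ("2 cores", core2_mp), ("4 cores", core3_mp)]:
    pl.plot(n_alpha, series, marker="o", label=label)
pl.xlabel("n_alphas")
pl.ylabel("total fit time over 3 runs (s)")
pl.legend()
pl.show()
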
# check maybe 10 kfolds would be better
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)

# Kernel Ridge Regression : made robust to outliers
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alphas_alt, cv=kfolds))

# LASSO Regression : made robust to outliers
lasso = make_pipeline(
    RobustScaler(), LassoCV(max_iter=1e7, alphas=alphas2, random_state=14, cv=kfolds))

# Elastic Net Regression : made robust to outliers
elasticnet = make_pipeline(
    RobustScaler(), ElasticNetCV(max_iter=1e7, alphas=e_alphas, cv=kfolds, l1_ratio=e_l1ratio))

# Gradient Boosting for regression
gboost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10,
                                   loss='huber', random_state=5)

# LightGBM regressor.
lgbm = lgb.LGBMRegressor(objective='regression', num_leaves=4, learning_rate=0.01,

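A hedged usage sketch: once these pipelines exist, their cross-validated error can be checked with cross_val_score. X and y below stand for the script's training features and target, and the grids alphas_alt, alphas2, e_alphas and e_l1ratio are defined elsewhere in that script; none of them are shown above.

import numpy as np
from sklearn.model_selection import cross_val_score

# Hypothetical evaluation of the elastic net pipeline under the same 10-fold scheme.
scores = cross_val_score(elasticnet, X, y, cv=kfolds, scoring="neg_mean_squared_error")
print("Elastic Net CV RMSE: %.4f" % np.sqrt(-scores).mean())
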
md = dnn_reg(X_train, y_train, X_test, y_test)
reg_eval(X_test, y_test, md)

### Lasso CV regression
def reg_eval2(y_test, model):
    y_pred = model.predict(X_test)
    print("evaluating the results for model:", model)
    print("MSE:", mean_squared_error(y_test, y_pred))
    print("R2:", r2_score(y_test, y_pred))
    print("EVS:", explained_variance_score(y_test, y_pred))

lasso = LassoCV(cv=5, random_state=0, max_iter=10000)
lasso.fit(X_train, y_train)
reg_eval2(y_test, lasso)

# ElasticNet Regression
ela = ElasticNetCV(l1_ratio=0.8, normalize=True, max_iter=5000, random_state=77)
ela.fit(X_train, y_train)
print("R square:", ela.score(X_test, y_test))
reg_eval2(y_test, ela)

# SVR Regression
from sklearn.svm import LinearSVR
LSVR = LinearSVR(epsilon=0.1, random_state=0, tol=1e-5, max_iter=10000)
# scaler=RobustScaler()
# pipe=Pipeline(steps=[("scaling",scaler),("rg",LSVR)])
LSVR.fit(X_train, y_train)
reg_eval2(y_test, LSVR)

#%%
interesting_ones = ['G13', 'G14', 'G15', 'G19', 'G21']
r2_mfcc = []
r2_stft = []
for chan in interesting_ones:
    y = Y[:, electrode_names.index(chan)]
    train_X, test_X, train_Y, test_Y = train_test_split(np.hstack([mfcc_X, X]), y, test_size=0.3)
    mfcctrain_X = train_X[:, :325]
    train_X = train_X[:, 325:]
    l1_ratio_grid = [0.1, 0.3, 0.5, 0.7, 0.9]
    enet_CV = ElasticNetCV(l1_ratio=l1_ratio_grid, n_jobs=-1, verbose=True)
    enet_CV.fit(train_X, train_Y)
    r2_stft.append(enet_CV.score(test_X[:, 325:], test_Y))
    enet_CV.fit(mfcctrain_X, train_Y)
    r2_mfcc.append(enet_CV.score(test_X[:, :325], test_Y))

#%%
#%for standardizing in lagged stimuli space
scaler = preprocessing.StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

#add intercept
#X = np.hstack((np.ones(X.shape[0])[:,None],X))

def do_validation(data_path, steps=10):
    allfiles = initialize(data_path)
    gbm = GradientBoostingRegressor(n_estimators=100, learning_rate=0.05, max_depth=6,
                                    min_samples_leaf=5, subsample=0.5)
    ada = AdaBoostRegressor(n_estimators=200, learning_rate=1)
    etree = ExtraTreesRegressor(n_estimators=200, n_jobs=-1, min_samples_leaf=5)
    rf = RandomForestRegressor(n_estimators=200, max_features=4, min_samples_leaf=5)
    kn = KNeighborsRegressor(n_neighbors=25)
    logit = LogisticRegression(tol=0.05)
    enet = ElasticNetCV(l1_ratio=0.75, max_iter=1000, tol=0.05)
    svr = SVR(kernel="linear", probability=True)
    ridge = Ridge(alpha=18)
    bridge = BayesianRidge(n_iter=500)

    gbm_metrics = 0.0
    ada_metrics = 0.0
    etree_metrics = 0.0
    rf_metrics = 0.0
    kn_metrics = 0.0
    logit_metrics = 0.0
    svr_metrics = 0.0
    ridge_metrics = 0.0
    bridge_metrics = 0.0
    enet_metrics = 0.0
    nnet_metrics = 0.0

    logistic = LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

    for i in xrange(steps):
        driver = allfiles[i]
        df, Y = create_merged_dataset(driver)
        df['label'] = Y
        # Shuffle DF.
        df = df.reindex(np.random.permutation(df.index))

        train = df[:100]
        label = train['label']
        del train['label']

        test = df[100:400]
        Y = test['label']
        del test['label']

        #to_drop = ['driver', 'trip', 'speed1', 'speed2', 'speed3', 'speed4', 'speed5', 'speed6', 'speed7', 'speed8', 'speed9',
        #           'speed10', 'speed11', 'speed12', 'speed13', 'speed14', 'speed15', 'speed16', 'speed17', 'speed18', 'speed19',
        #           'speed20', 'speed21', 'speed22', 'speed23', 'speed24', 'speed25', 'speed26', 'speed27', 'speed28', 'speed29',
        #           'speed30', 'speed31', 'speed32', 'speed33', 'speed34', 'speed35', 'speed36', 'speed37', 'speed38', 'speed39',
        #           'speed40', 'speed41', 'speed42', 'speed43', 'speed44', 'speed45', 'speed46', 'speed47', 'speed48', 'speed49',
        #           'speed50', 'speed51', 'speed52', 'speed53', 'speed54', 'speed55', 'speed56', 'speed57', 'speed58', 'speed59',
        #           'speed60', 'speed61', 'speed62', 'speed63', 'speed64', 'speed65', 'speed66', 'speed67', 'speed68', 'speed69',
        #           'speed70', 'speed71', 'speed72', 'speed73', 'speed74', 'speed75', 'speed76', 'speed77', 'speed78', 'speed79', 'speed80']
        to_drop = ['driver', 'trip']

        X_train = train.drop(to_drop, 1)
        X_test = test.drop(to_drop, 1)

        gbm.fit(X_train, label)
        Y_hat = gbm.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        gbm_metrics += metrics.auc(fpr, tpr)

        ada.fit(X_train, label)
        Y_hat = ada.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        ada_metrics += metrics.auc(fpr, tpr)

        etree.fit(X_train, label)
        Y_hat = etree.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        etree_metrics += metrics.auc(fpr, tpr)

        rf.fit(X_train, label)
        Y_hat = rf.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        rf_metrics += metrics.auc(fpr, tpr)

        kn.fit(X_train, label)
        Y_hat = kn.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        kn_metrics += metrics.auc(fpr, tpr)

        # Linear models.
        to_drop = ['driver', 'trip', 'distance', 'sd_acceleration', 'final_angle', 'mean_acceleration',
                   'mean_avg_speed', 'sd_inst_speed', 'sd_avg_speed', 'mean_inst_speed', 'points']
        X_train = train.drop(to_drop, 1)
        X_test = test.drop(to_drop, 1)

        logit.fit(X_train, label)
        Y_hat = [i[1] for i in logit.predict_proba(X_test)]
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        logit_metrics += metrics.auc(fpr, tpr)

        svr.fit(X_train, label)
        Y_hat = svr.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        svr_metrics += metrics.auc(fpr, tpr)

        ridge.fit(X_train, label)
        Y_hat = ridge.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        ridge_metrics += metrics.auc(fpr, tpr)

        bridge.fit(X_train, label)
        Y_hat = bridge.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        bridge_metrics += metrics.auc(fpr, tpr)

        enet.fit(X_train, label)
        Y_hat = enet.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        enet_metrics += metrics.auc(fpr, tpr)

        classifier.fit(X_train, label)
        Y_hat = classifier.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        nnet_metrics += metrics.auc(fpr, tpr)

    print ""
    print "GBM:", gbm_metrics/steps
    print "AdaBoost:", ada_metrics/steps
    print "Extra Trees:", etree_metrics/steps
    print "RF:", rf_metrics/steps
    print "KN:", kn_metrics/steps
    print ""
    print "Logit:", logit_metrics/steps
    print "SVR:", svr_metrics/steps
    print "Ridge:", ridge_metrics/steps
    print "BayesianRidge:", bridge_metrics/steps
    print "Elastic Net:", enet_metrics/steps
    print "Neural Networks:", nnet_metrics/steps
    print ""

        ('Poly', PolynomialFeatures(include_bias=True)),
        # alpha is the weight of the L2 penalty term in the Ridge algorithm
        # alphas gives the range of candidate alpha values searched for Ridge during cross-validation
        ('Linear', RidgeCV(alphas=np.logspace(-3, 2, 50), fit_intercept=False))
    ]),
    Pipeline([
        ('Poly', PolynomialFeatures(include_bias=True)),
        ('Linear', LassoCV(alphas=np.logspace(0, 1, 10), fit_intercept=False))
    ]),
    Pipeline([
        ('Poly', PolynomialFeatures(include_bias=True)),
        # l1_ratio: the share of the L1 term within the elastic net penalty; a list is given here,
        # i.e. the candidate L1-ratio values searched during cross-validation
        ('Linear', ElasticNetCV(alphas=np.logspace(0, 1, 10),
                                l1_ratio=[.1, .5, .7, .9, .95, 1], fit_intercept=False))
    ])
]

# Visualising over-fitting of the linear models
plt.figure(facecolor='w')
degree = np.arange(1, N, 4)
dm = degree.size
colors = []
for c in np.linspace(16711680, 255, dm):
    colors.append('#%06x' % int(c))
model = models[0]
for i, d in enumerate(degree):
    plt.subplot(int(np.ceil(dm/2)), 2, i+1)
    plt.plot(x, y, 'ro', ms=10, zorder=N)
    model.set_params(Poly__degree=d)