def bayesian_ridge_regression(feature_array, label_array): clf = BayesianRidge(compute_score=True) clf.fit(feature_array, label_array) ols = LinearRegression() ols.fit(feature_array, label_array) n_features = 9 plt.figure(figsize=(6, 5)) plt.title("Weights of the model") plt.plot(clf.coef_, 'b-', label="Bayesian Ridge estimate") plt.plot(label_array, 'g-', label="Ground truth") plt.plot(ols.coef_, 'r--', label="OLS estimate") plt.xlabel("Features") plt.ylabel("Values of the weights") plt.legend(loc="best", prop=dict(size=12)) plt.figure(figsize=(6, 5)) plt.title("Histogram of the weights") plt.hist(clf.coef_, bins=n_features, log=True) # plt.plot(clf.coef_[feature_array], 5 * np.ones(len(feature_array)), # 'ro', label="Relevant features") plt.ylabel("Features") plt.xlabel("Values of the weights") plt.legend(loc="lower left") plt.figure(figsize=(6, 5)) plt.title("Marginal log-likelihood") plt.plot(clf.scores_) plt.ylabel("Score") plt.xlabel("Iterations") plt.show()
def ridreg(df, test):
    clf = BayesianRidge()
    target = df['count']
    train = df[['time', 'temp']]
    test = test[['time', 'temp']]
    clf.fit(train, target)
    print(test.head(3))
    # build a plain list of float rows for prediction
    final = []
    for row in test.values:
        final.append([float(x) for x in row])
    predicted_probs = clf.predict(final)
    # keep the datetime column from the raw test file for reference
    keep = pd.read_csv('data/test.csv')
    keep = keep['datetime']
    # save predictions to file
    predicted_probs = pd.DataFrame(predicted_probs)
    print(predicted_probs.head(3))
    predicted_probs.to_csv('data/submission3.csv', index=False)
def bayes_ridge_reg(self):
    br = BayesianRidge()
    br.fit(self.x_data, self.y_data)
    adjusted_result = br.predict(self.x_data)
    print("bayes ridge params", br.coef_, br.intercept_)
    print("bayes ridge accuracy", get_accuracy(adjusted_result, self.y_data))
    return list(map(int, adjusted_result))
def bayesRegr(source, target):
    # the last column of `source` is the target; all other columns are features
    clf = BayesianRidge()
    features = source.columns[:-1]
    klass = source[source.columns[-1]]
    clf.fit(source[features], klass)
    preds = clf.predict(target[target.columns[:-1]])
    return preds
def fit_model_10(self, toWrite=False):
    model = BayesianRidge(n_iter=5000)

    for data in self.cv_data:
        X_train, X_test, Y_train, Y_test = data
        model.fit(X_train, Y_train)
        pred = model.predict(X_test)
        print("Model 10 score %f" % (logloss(Y_test, pred),))

    if toWrite:
        # pickle needs a binary file handle
        with open('model10/model.pkl', 'wb') as f2:
            pickle.dump(model, f2)
def train_BayesianRegressionModel( X, y, n_iter=300, tol=0.001, alpha_1=1e-06, alpha_2=1e-06, lambda_1=1e-06, lambda_2=1e-06, compute_score=False, fit_intercept=True, normalize=False, copy_X=True, verbose=False, ): """ Train a Bayesian regression model """ model = BayesianRidge( n_iter=n_iter, tol=tol, alpha_1=alpha_1, alpha_2=alpha_2, lambda_1=lambda_1, lambda_2=lambda_2, compute_score=compute_score, fit_intercept=fit_intercept, normalize=normalize, copy_X=copy_X, verbose=verbose, ) model = model.fit(X, y) return model
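# A minimal usage sketch for the wrapper above, on synthetic data. Note this
# assumes an sklearn version that still accepts the `normalize` keyword the
# wrapper forwards (it was removed in scikit-learn >= 1.2).
import numpy as np
from sklearn.datasets import make_regression

X_demo, y_demo = make_regression(n_samples=200, n_features=5, noise=10.0, random_state=0)
demo_model = train_BayesianRegressionModel(X_demo, y_demo, compute_score=True)
print(demo_model.coef_, demo_model.intercept_)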
def br_modeling(data, y_name, candidates_location): from sklearn.linear_model import BayesianRidge temp = data.copy() candidates = get_variables("./%s" % candidates_location) temp = rf_trim(temp, y_name, candidates) model = BayesianRidge() res = model.fit(temp[candidates], temp[y_name]) joblib.dump(res, "./%sbr_model%s.pkl" % (y_name, datetime.datetime.today())) return res
class BayesianRRCalculator(EnergyCalculator): """Energy calculator using global feature vectors and Bayesian Ridge Regression.""" def __init__(self, feature_key): EnergyCalculator.__init__(self) self.ridge = BayesianRidge(fit_intercept=False) self.energy_key = 'BRR' self.feature_key = feature_key def fit(self, training_set, energy_key): """Fit the BRR model. The feature vectors with key=self.feature_key will be used for feature vectors. The energy with the specified energy_key will be the target function. Parameters: training_set : list of Nanoparticles energy_key : str """ feature_vectors = [p.get_feature_vector(self.feature_key) for p in training_set] energies = [p.get_energy(energy_key) for p in training_set] self.ridge.fit(feature_vectors, energies) def get_coefficients(self): return self.ridge.coef_ def set_coefficients(self, new_coefficients): self.ridge.coef_ = new_coefficients def set_feature_key(self, feature_key): self.feature_key = feature_key def compute_energy(self, particle): """Compute the energy using BRR. Assumes that a feature vector with key=self.feature_key is present in the particle. Parameters: particle : Nanoparticle """ brr_energy = np.dot(np.transpose(self.ridge.coef_), particle.get_feature_vector(self.feature_key)) particle.set_energy(self.energy_key, brr_energy)
def ridge():
    ac = loadmat('./data/component_contribution_python.mat')
    S = ac['train_S']
    df_S = pd.DataFrame(ac['train_S'])
    df_S_unique = df_S.T.drop_duplicates().T
    unque_cols = df_S_unique.columns.values.tolist()
    S = S[:, unque_cols]
    G = ac['G']
    # b = ac['b']
    b_list = json.load(open('./data/median_b.json'))
    b = np.asarray(b_list)
    b = np.reshape(b, (-1, 1))
    # w = ac['w']

    m, n = S.shape
    assert G.shape[0] == m
    assert b.shape == (n, 1)

    STG = np.dot(S.T, G)
    X = STG
    y = b

    # clf = Ridge(alpha=0.1, fit_intercept=False)
    # clf.fit(X, y)
    # print('R2', clf.score(X, y))
    # print(clf.coef_)

    reg = BayesianRidge(tol=1e-6, fit_intercept=False, compute_score=True)
    reg.fit(X, y)
    # print(reg.coef_)
    # diagonal of the posterior covariance gives per-coefficient variances
    conv = reg.sigma_
    conv_coeff = [conv[i][i] for i in range(len(conv))]
    for num in conv_coeff[0:263]:
        if num < 500:
            print(num)
def baysian_curve (): for degree in range(1,12): clf_poly = BayesianRidge(compute_score=True) #BayesianRidge library used clf_poly.fit(num.vander(X, degree), t) #Bayesian fit the data using L2(ridge) regularization #print(clf_poly.coef_[0:len(clf_poly.coef_)-1]) X_plot = num.linspace(0, 1, 500) y_plot = f(X_plot) y_mean, y_std = clf_poly.predict(num.vander(X_plot, degree), return_std=True) #this will return mean and normal fitted polynomial plot.figure(figsize=(6,5)) plot.title('Bayesian Curve Fitting for polynomial of Degree = ' + str(degree-1) , color='black') plot.errorbar(X_plot, y_mean, y_std, color='red', label="Polynomial Bayesian Regression", linewidth=lw) plot.plot(X_plot, y_plot, color='green', label="Sine Curve") plot.plot(X,t,'o') plot.ylabel("Output y") plot.xlabel("Feature X") plot.legend(loc="lower left") plot.subplots_adjust(hspace=2.0) #plot the generated data plot.subplots_adjust(wspace=0.2) plot.show()
def stackModel(train_stack, test_stack, y_train, myMetrics_type, n_runs=3, n_folds=5, use_StratifiedKFold=True): predictions_stack_sum_of_runs = np.zeros(len(test_stack)) oof_stack_sum_of_runs = np.zeros(len(train_stack)) for run in range(n_runs): predictions = np.zeros(test_stack.shape[0]) oof_stack_pre = np.zeros(train_stack.shape[0]) random_seed = 2015 + 1000 * (run + 1) if use_StratifiedKFold: sfolder = StratifiedKFold(n_splits=n_folds, random_state=random_seed, shuffle=True) else: sfolder = KFold(n_splits=n_folds, random_state=random_seed, shuffle=True) skf_tmp = sfolder.split(train_stack, y_train) for fold_, (trn_idx, val_idx) in enumerate(skf_tmp): print("Stack Run: %d, Fold: %d" % (run+1, fold_+1)) trn_data, trn_y = train_stack[trn_idx], y_train.iloc[trn_idx] val_data, val_y = train_stack[val_idx], y_train.iloc[val_idx] clf_3 = BayesianRidge() clf_3.fit(trn_data, trn_y) oof_stack_pre[val_idx] = clf_3.predict(val_data) predictions += clf_3.predict(test_stack) / n_folds score = MyMetrics(myMetrics_type).metricsFunc(oof_stack_pre, y_train) print("Stack Run: {}, CV val score: {:<8.5f}".format(run + 1, score)) predictions_stack_sum_of_runs = predictions_stack_sum_of_runs + predictions oof_stack_sum_of_runs = oof_stack_sum_of_runs + oof_stack_pre predictions_stack_mean_of_runs = predictions_stack_sum_of_runs / n_runs oof_stack_mean_of_runs = oof_stack_sum_of_runs / n_runs finalScore = MyMetrics(myMetrics_type).metricsFunc(oof_stack_mean_of_runs, y_train) print("Final score: {}".format(finalScore)) return predictions_stack_mean_of_runs, oof_stack_mean_of_runs, finalScore # train_stack = np.vstack([oof_lgb, oof_xgb]).transpose() # test_stack = np.vstack([predictions_lgb, predictions_xgb]).transpose()
def bayes_regression(x_train, x_test, y_train, y_test):
    model = BayesianRidge()
    model.fit(x_train, y_train)
    score = model.score(x_test, y_test)
    print("score", score)
    y_pred = model.predict(x_test)
    print("mean_squared_error", mean_squared_error(y_test, y_pred))
    # save the results
    # result = model.predict(test_df)
    # print(result)
    # result_df = pd.DataFrame(result, columns=['target'])
    # result_df.to_csv("0.098.txt", index=False, header=False)
    # plot the learning curve
    plot_learning_curve(model, title="learn_rate", X=x_train, y=y_train, cv=10)
def bay_ridge_model(self, X_train, y_train, X_test, y_test): bay_ridge_model = BayesianRidge(alpha_1=1, alpha_2=1, lambda_1=900, lambda_2=100) bay_ridge_model.fit(X_train, y_train) y_train_pred = bay_ridge_model.predict(X_train) y_test_pred = bay_ridge_model.predict(X_test) # Scoring the model print(bay_ridge_model.score(X_train, y_train)) print(bay_ridge_model.score(X_test, y_test)) print('MSE train: %.6f, MSE test: %.6f' % (mean_squared_error( y_train, y_train_pred), mean_squared_error(y_test, y_test_pred))) print('R^2 train: %.6f, R^2 test: %.6f' % (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))
def fit_predict(self, m_current): obs, _ = self.get_obs_ixs(m_current) Uo, mo, wo = self.U[obs], np.copy(m_current)[obs], self.weighting[obs] ridge = BayesianRidge( fit_intercept=self.add_bias, alpha_init=self.alpha_init, lambda_init=self.lambda_init, alpha_1=self.alpha_1, alpha_2=self.alpha_2, lambda_1=self.lambda_1, lambda_2=self.lambda_2, ) ridge.fit(Uo, mo, sample_weight=wo) pred, pred_std = ridge.predict(self.U, return_std=True) pred[obs] = m_current[obs] # standard deviation of observed regions is 0 pred_std[obs] = 0.0 self.model = ridge return pred, pred_std
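# Standalone sketch of the predictive-uncertainty call used above:
# BayesianRidge.predict(..., return_std=True) returns both the mean and the
# standard deviation of the posterior predictive distribution (toy data).
import numpy as np
from sklearn.linear_model import BayesianRidge

rng = np.random.RandomState(0)
X_toy = rng.randn(50, 3)
y_toy = X_toy @ np.array([1.0, -2.0, 0.5]) + 0.1 * rng.randn(50)

reg = BayesianRidge().fit(X_toy, y_toy)
mean, std = reg.predict(X_toy[:5], return_std=True)
print(mean, std)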
def from_station(station, departing=False): journeys = at_station(station) livst = station == 'LIVST' X = [row[2:4] for row in journeys if None not in [row[3], row[4], row[5]]] Y = [ row[-1 if departing else -2] for row in journeys if None not in [row[3], row[4], row[5]] ] if livst: X = [row[2:4] for row in journeys if None not in [row[3], row[4]]] Y = [row[-2] for row in journeys if None not in [row[3], row[4]]] reg = BayesianRidge() knn = KNeighborsRegressor(n_neighbors=2) X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size=0.2, ) reg.fit(X_train, Y_train) knn.fit(X_train, Y_train) Y_pred = [int(round(num)) for num in reg.predict(X_test)] Y_pred2 = [int(round(num)) for num in knn.predict(X_test)] # df = pd.DataFrame({'Actual': Y_test, 'Predicted': Y_pred, 'KNN': Y_pred2}) # df1 = df.head(25) # print(df1) # print('Mean Absolute', metrics.mean_absolute_error(Y_test, Y_pred), # metrics.mean_absolute_error(Y_test, Y_pred2)) # print('Mean Squared', metrics.mean_squared_error(Y_test, Y_pred), # metrics.mean_squared_error(Y_test, Y_pred2)) # print('Root Mean Squared', # np.sqrt((metrics.mean_squared_error(Y_test, Y_pred))), # np.sqrt((metrics.mean_squared_error(Y_test, Y_pred2)))) # print(reg.predict([[1, 15]])) # print(Y_pred) # format_results(Y_test, Y_pred, Y_pred2) return reg
def fit_polynomial_bayesian_skl(X, Y, degree, lambda_shape=1.e-6, lambda_invscale=1.e-6, padding=10, n=100, X_unknown=None): X_v = pol.polyvander(X, degree) clf = BayesianRidge(lambda_1=lambda_shape, lambda_2=lambda_invscale) clf.fit(X_v, Y) coeff = np.copy(clf.coef_) # there some weird intercept thing # since the Vandermonde matrix has 1 at the beginning, just add this # intercept to the first coeff coeff[0] += clf.intercept_ ret_ = [coeff] # generate the line x = np.linspace(X.min()-padding, X.max()+padding, n) x_v = pol.polyvander(x, degree) # using the provided predict method y_1 = clf.predict(x_v) # using np.dot() with coeff y_2 = np.dot(x_v, coeff) ret_.append(((x, y_1), (x, y_2))) if X_unknown is not None: xu_v = pol.polyvander(X_unknown, degree) # using the predict method yu_1 = clf.predict(xu_v) # using np.dot() with coeff yu_2 = np.dot(xu_v, coeff) ret_.append(((X_unknown, yu_1), (X_unknown, yu_2))) return ret_
def fit(self, smiles, y=None, *, X_scaler=None, y_scaler=None, **kwargs):
    """
    Parameters
    ----------
    smiles: list[str]
        SMILES for training.
    y: pandas.DataFrame
        Target properties for training.
    X_scaler: Scaler (optional, not implemented)
        Scaler for transform X.
    y_scaler: Scaler (optional, not implemented)
        Scaler for transform y.
    kwargs: dict
        Parameters passed to BayesianRidge initialization.
    """
    if self._mdl:
        raise RuntimeError('estimators have been set. '
                           'If you want to re-train these estimators, '
                           'please use `remove_estimator()` method first.')
    if not isinstance(y, pd.DataFrame):
        raise TypeError('please package all properties into a pd.DataFrame')

    # remove NaN from X
    desc = self._descriptor.transform(smiles, return_type='df').reset_index(drop=True)
    y = y.reset_index(drop=True)
    desc.dropna(inplace=True)
    y = y.loc[desc.index]

    for c in y:
        y_ = y[c]  # get target property.
        # remove NaN from y_
        y_.dropna(inplace=True)
        desc_ = desc.loc[y_.index]
        desc_ = desc_.values

        mdl = BayesianRidge(compute_score=True, **kwargs)
        mdl.fit(desc_, y_)
        self._mdl[c] = mdl
def mr_link_ridge(outcome_geno, r_sq_mat, exposure_betas, causal_exposure_indices, outcome_phenotype, upper_r_sq_threshold=0.99, lower_r_sq_threshold=0.1, prune_r_sq_threshold=0.95, ): """ Does MR-link solved by ridge regression. Please note that the p value and se is uncorrected. so these are usually _very_ conservative. See the MR-link manuscript for details. :param outcome_geno: outcome genotypes :param r_sq_mat: R^2 matrix in order of genotypes of outcome geno :param exposure_betas: beta estimates of the exposure instrumental variables. :param causal_exposure_indices: indices of the exposure instrumental variables. :param outcome_phenotype: outcome phenotype vector :param upper_r_sq_threshold: the upper r_sq threshold for which the variants around the IVs are pruned. :return: beta, se and p value estimate of the MR-link estimate """ design_mat = make_mr_link_design_matrix(outcome_geno, r_sq_mat, exposure_betas, causal_exposure_indices, upper_r_sq_threshold=upper_r_sq_threshold, lower_r_sq_threshold=lower_r_sq_threshold, prune_r_sq_threshold=prune_r_sq_threshold ) ridge_fit = BayesianRidge(fit_intercept=False) ridge_fit.fit(design_mat, outcome_phenotype) t_stat = np.abs(ridge_fit.coef_[0] / np.sqrt(ridge_fit.sigma_[0, 0])) p_val = 2 * scipy.stats.norm.sf(t_stat) return ridge_fit.coef_[0], np.sqrt(ridge_fit.sigma_[0,0]), p_val
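# Self-contained sketch of the Wald-style statistic computed above: a fitted
# BayesianRidge exposes the posterior covariance of the coefficients as
# `sigma_`, so a normal-approximation p-value can be derived from it
# (toy data here, not the MR-link design matrix).
import numpy as np
import scipy.stats
from sklearn.linear_model import BayesianRidge

rng = np.random.RandomState(1)
X_toy = rng.randn(200, 4)
y_toy = 0.8 * X_toy[:, 0] + rng.randn(200)

fit = BayesianRidge(fit_intercept=False).fit(X_toy, y_toy)
se = np.sqrt(np.diag(fit.sigma_))            # posterior standard errors
t_stat = np.abs(fit.coef_ / se)
p_vals = 2 * scipy.stats.norm.sf(t_stat)     # two-sided normal p-values
print(fit.coef_, se, p_vals)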
def rand_search_lm(funcs,df,var_y,var_media,var_nonmedia,n): clf = BayesianRidge(compute_score=True,fit_intercept=True) my_coefs=np.zeros((n,len(var_media)+ len(var_nonmedia)+1)) my_scores=np.zeros((n,1)) par_list=ram_par_create(funcs,df,var_media,n) df_var=df.loc[:,var_y+var_nonmedia+var_media] if funcs == "log_y": y=np.log(df[var_y]/(100-df[var_y])).values elif funcs == "Simple Power": y=df[var_y].values elif funcs == "S curves": y=df[var_y].values elif funcs == "log_log": y=np.log(df[var_y]).values ###modeling for iteration in range(par_list.shape[0]): X = df_var.iloc[:,1:].values for j in range(len(var_media)): if funcs == "log_y": X[:,j+len(var_nonmedia)]=Adstock(df[var_media[j]],par_carryover=par_list[iteration][j]) elif funcs == "Simple Power": X[:,j+len(var_nonmedia)]=Adstock(df[var_media[j]],par_carryover=par_list[iteration][j])**par_list[iteration][j+len(var_media)] elif funcs == "S curves": X[:,j+len(var_nonmedia)]=np.exp(par_list[iteration][j+len(var_media)]-par_list[iteration][j+2*len(var_media)]/(Adstock(df[var_media[j]],par_carryover=par_list[iteration][j])+0.00001)) elif funcs == "log_log": X[:,j+len(var_nonmedia)]=np.log(Adstock(df[var_media[j]],par_carryover=par_list[iteration][j])+0.00001) clf.fit(X, y) my_coefs[iteration,0]=clf.intercept_ my_coefs[iteration,1:]=clf.coef_ my_scores[iteration,0]=clf.score(X,y) return my_coefs,my_scores,par_list
def test_bayesian_ridge_score_values(): """Check value of score on toy example. Compute log marginal likelihood with equation (36) in Sparse Bayesian Learning and the Relevance Vector Machine (Tipping, 2001): - 0.5 * (log |Id/alpha + X.X^T/lambda| + y^T.(Id/alpha + X.X^T/lambda).y + n * log(2 * pi)) + lambda_1 * log(lambda) - lambda_2 * lambda + alpha_1 * log(alpha) - alpha_2 * alpha and check equality with the score computed during training. """ X, y = diabetes.data, diabetes.target n_samples = X.shape[0] # check with initial values of alpha and lambda (see code for the values) eps = np.finfo(np.float64).eps alpha_ = 1. / (np.var(y) + eps) lambda_ = 1. # value of the parameters of the Gamma hyperpriors alpha_1 = 0.1 alpha_2 = 0.1 lambda_1 = 0.1 lambda_2 = 0.1 # compute score using formula of docstring score = lambda_1 * log(lambda_) - lambda_2 * lambda_ score += alpha_1 * log(alpha_) - alpha_2 * alpha_ M = 1. / alpha_ * np.eye(n_samples) + 1. / lambda_ * np.dot(X, X.T) M_inv = pinvh(M) score += - 0.5 * (fast_logdet(M) + np.dot(y.T, np.dot(M_inv, y)) + n_samples * log(2 * np.pi)) # compute score with BayesianRidge clf = BayesianRidge(alpha_1=alpha_1, alpha_2=alpha_2, lambda_1=lambda_1, lambda_2=lambda_2, n_iter=1, fit_intercept=False, compute_score=True) clf.fit(X, y) assert_almost_equal(clf.scores_[0], score, decimal=9)
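# Minimal sketch of the bookkeeping the test above relies on: with
# compute_score=True, a fitted BayesianRidge stores the log marginal
# likelihood in `scores_`, from the initial alpha/lambda through the final
# estimates (toy data for illustration).
import numpy as np
from sklearn.linear_model import BayesianRidge

rng = np.random.RandomState(42)
X_toy = rng.randn(100, 5)
y_toy = X_toy @ rng.randn(5) + 0.05 * rng.randn(100)

clf = BayesianRidge(compute_score=True).fit(X_toy, y_toy)
print(len(clf.scores_), clf.scores_[0], clf.scores_[-1])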
def stack(*avg): train_stack = np.vstack(avg[0]).transpose() test_stack = np.vstack(avg[1]).transpose() y_train = avg[2] folds_stack = StratifiedKFold(n_splits=10, shuffle=True, random_state=8888) oof_stack = np.zeros(train_stack.shape[0]) predictions = np.zeros(test_stack.shape[0]) for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack, y_train)): print("fold :", fold_ + 1) trn_data, trn_y = train_stack[trn_idx], y_train[trn_idx] val_data, val_y = train_stack[val_idx], y_train[val_idx] stacking = BayesianRidge() stacking.fit(trn_data, trn_y) oof_stack[val_idx] = stacking.predict(val_data) predictions += stacking.predict(test_stack) / folds_stack.n_splits print("stacking auc score: {:<8.8f}".format( roc_auc_score(y_train, oof_stack))) return predictions
def predict_price(area) -> float: """ This method must accept as input an array `area` (represents a list of areas sizes in sq feet) and must return the respective predicted prices (price per sq foot) using the linear regression model that you build. You can run this program from the command line using `python3 regression.py`. """ response = requests.get(TRAIN_DATA_URL) s = response.content d = pd.read_csv(io.StringIO(s.decode('utf-8'))) d = d.T d.reset_index(level=0, inplace=True) d = d[1:] d.columns = ['area', 'price'] model = BayesianRidge() model.fit(numpy.reshape(numpy.array(d['area']), (-1, 1)), numpy.reshape(numpy.array(d['price']), (-1, 1))) area = area.reshape(-1, 1) print(model.coef_) print(model.intercept_) return model.predict(area)
def Train_Model(X_Train, Y_Train, X_Test):
    # fit Bayesian ridge regression to the training dataset
    regressor = BayesianRidge()
    fitResult = regressor.fit(X_Train, Y_Train)
    # predict on X_Test
    b = fitResult.predict(X_Test)
    # exponentiate the predicted values and export them to out.csv
    b = np.exp(b)
    np.savetxt('out.csv', b)
def test_bayesian_initial_params(): # Test BayesianRidge with initial values (alpha_init, lambda_init) X = np.vander(np.linspace(0, 4, 5), 4) y = np.array([0., 1., 0., -1., 0.]) # y = (x^3 - 6x^2 + 8x) / 3 # In this case, starting from the default initial values will increase # the bias of the fitted curve. So, lambda_init should be small. reg = BayesianRidge(alpha_init=1., lambda_init=1e-3) # Check the R2 score nearly equals to one. r2 = reg.fit(X, y).score(X, y) assert_almost_equal(r2, 1.)
def __get_fitness(self, genes, x, y, initialFitness=None, runs=1): nog = np.count_nonzero(genes) if nog == 0: return self.__get_fitness_instance(0, 1000, 1, len(genes)) hashedGenes = hash(genes.data.tobytes()) self.FitnessCites += 1 if initialFitness is not None and hashedGenes in self.Memo: #Tabu mem search self.SavedCalculationTimes += 1 return self.__get_fitness_instance(initialFitness.Metric - 10, 1000, initialFitness.Std + 0.1, initialFitness.NOG) clf = BayesianRidge(n_iter=100) clf.fit(x[:, np.where(genes == 1)[0]], y) scores2, bic = regression_accuracy_scorer( clf, x[:, np.where(genes == 1)[0]], y) fitness = self.__get_fitness_instance(scores2, bic, 0, nog) self.Memo[hashedGenes] = fitness return fitness
def model(): x_train, x_test, y_train, y_test = feature_sel_data_split() model_1 = LinearRegression() model_1.fit(x_train, y_train) model_1_result = model_1.predict(x_test) model_1_file = open("data/linear_reg.model", 'wb') pickle.dump(model_1, model_1_file) model_1_file.close() ridge = Ridge(alpha=0.5) ridge.fit(x_train, y_train) ridge_result = ridge.predict(x_test) ridge_file = open("data/ridge.model", 'wb') pickle.dump(ridge, ridge_file) ridge_file.close() lasso = Lasso(alpha=0.01) lasso.fit(x_train, y_train) lasso_result = lasso.predict(x_test) lasso_file = open("data/lasso.model", 'wb') pickle.dump(lasso, lasso_file) lasso_file.close() bayesian = BayesianRidge() bayesian.fit(x_train, y_train) bayesian_result = bayesian.predict(x_test) bayesian_file = open("data/bayesian.model", 'wb') pickle.dump(bayesian, bayesian_file) bayesian_file.close() elastic = ElasticNet(alpha=0.01) elastic.fit(x_train, y_train) elastic_result = elastic.predict(x_test) elastic_file = open("data/elastic.model", 'wb') pickle.dump(elastic, elastic_file) elastic_file.close() return y_test, [ model_1_result, ridge_result, elastic_result, lasso_result, bayesian_result ]
def get_stacking(self, oof_list, prediction_list, labels):
    train_stack = np.vstack(oof_list).transpose()
    test_stack = np.vstack(prediction_list).transpose()
    repeats = len(oof_list)
    # RepeatedKFold: k-fold cross-validation repeated `repeats` times
    kfolder = RepeatedKFold(n_splits=self.n_fold, n_repeats=repeats, random_state=4590)
    kfold = kfolder.split(train_stack, labels)
    preds_list = list()
    stacking_oof = np.zeros(train_stack.shape[0])

    for train_index, vali_index in kfold:
        k_x_train = train_stack[train_index]
        k_y_train = labels.loc[train_index]
        k_x_vali = train_stack[vali_index]

        gbm = BayesianRidge(normalize=True)
        gbm.fit(k_x_train, k_y_train)
        k_pred = gbm.predict(k_x_vali)
        stacking_oof[vali_index] = k_pred

        preds = gbm.predict(test_stack)
        preds_list.append(preds)

    fold_mae_error = mean_absolute_error(labels, stacking_oof)
    print(f'stacking fold mae error is {fold_mae_error}')
    fold_score = 1 / (1 + fold_mae_error)
    print(f'fold score is {fold_score}')

    preds_columns = ['preds_{id}'.format(id=i) for i in range(self.n_fold * repeats)]
    preds_df = pd.DataFrame(data=preds_list)
    preds_df = preds_df.T
    preds_df.columns = preds_columns
    stacking_prediction = list(preds_df.mean(axis=1))
    return stacking_oof, stacking_prediction
def main(): # Multiple Regression using Backward Elimination and Cross_Val # df = pd.read_csv('train_adjusted.csv') df = pd.read_csv('train_adjusted.csv') columns = list(df.columns.values) # train all columns x = df[[ 'LotArea', 'OverallQual', 'OverallCond', 'MasVnrArea', 'BsmtFinSF1', 'BsmtUnfSF', '1stFlrSF', '2ndFlrSF', 'BsmtFullBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fire2laces', 'GarageYrBlt', 'GarageCars', 'WoodDeckSF', 'ScreenPorch' ]] # y = df.iloc[:, lambda df: [38]].values y = df[['SalePrice']] poly = PolynomialFeatures(degree=2, include_bias=False) x = poly.fit_transform(x) sds = StandardScaler() x = sds.fit_transform(x) model = BayesianRidge() model.fit(x, y) scores = cross_val_score(model, x, y, cv=10) print(scores) predictions = cross_val_predict(model, x, y, cv=10) plt.scatter(y, predictions) coeff = metrics.r2_score(y, predictions) print("R^2 Value:", coeff) rmse = np.sqrt(mean_squared_error(predictions, y)) print('Root Mean Squared Error:', rmse) plt.scatter(y, predictions) plt.xlabel("Actual Sales Price") plt.ylabel("Predictions")
def bayeImpute(data, target_col, verbose=0): ''' currently, BayesianRidge. return the imputated data, and model coefficient ''' from sklearn.linear_model import BayesianRidge, LinearRegression from sklearn.ensemble import RandomForestRegressor model = BayesianRidge() # model = LinearRegression() # model = RandomForestRegressor() original_data = np.copy(data) target = data[:, target_col] data = np.delete(data, obj=target_col, axis=1) #remove the missing-value column mv_mask = pd.isnull(target) if verbose: print("number of imputated cells: {}".format( sum(pd.isnull(original_data[:, target_col])))) x_test = data[mv_mask] x_train = data[~mv_mask] y_train = target[~mv_mask] # check if valid to regression: wether only one value exist in target. # If happen, use default "mean" method (which is all same) is_other_value = False in (y_train == y_train[0]) if (not is_other_value): model = "mean" original_data[mv_mask, target_col] = y_train[0] * len(mv_mask) return original_data, model model.fit(x_train, y_train) result = model.predict(x_test) original_data[ mv_mask, target_col] = result #put the imputation result back to original data, following the index # print("coefficient: {}".format(model.coef_)) return original_data, model
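# Toy usage of bayeImpute above: impute NaNs in one column of a random matrix
# (the column index and data here are illustrative only; pandas and numpy are
# assumed to be imported as pd/np in the surrounding module).
import numpy as np

rng = np.random.RandomState(0)
demo = rng.rand(20, 4)
demo[::5, 2] = np.nan                      # knock out a few cells in column 2
imputed, fitted_model = bayeImpute(demo, target_col=2, verbose=1)
print(np.isnan(imputed).sum())             # expected: 0 after imputation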
def bayfit(data): #Bayesian regression - very bad lw = 2 x = np.arange(len(data) - 1) degree = 3 clf_poly = BayesianRidge() clf_poly.fit(np.vander(x, degree), data[x]) x_plot = np.arange(len(data)) y_mean, y_std = clf_poly.predict(np.vander(x_plot, degree), return_std=True) plt.figure(figsize=(6, 5)) plt.errorbar(x_plot, y_mean, y_std, color='navy', label="Polynomial Bayesian Ridge Regression", linewidth=lw) plt.plot(x_plot, data, color='gold', linewidth=lw, label="Ground Truth") plt.ylabel("Output y") plt.xlabel("Feature X") plt.legend(loc="lower left") plt.show()
def train_classiifer(X_train, y_train, to_tune, classifier): # Initialize Classifier. clf = BayesianRidge() clf = SVR(kernel='rbf', C=1e3, gamma=0.1) #clf = RandomForestRegressor() if classifier: clf = classifier to_tune = False if to_tune: # Grid search: find optimal classifier parameters. param_grid = {'alpha_1': sp_rand(), 'alpha_2': sp_rand()} param_grid = {'C': sp_rand(), 'gamma': sp_rand()} rsearch = RandomizedSearchCV(estimator=clf, param_distributions=param_grid, n_iter=5000) rsearch.fit(X_train, y_train) # Use tuned classifier. clf = rsearch.best_estimator_ # Trains Classifier clf.fit(X_train, y_train) return clf
def stacking_model(oof_lgb, oof_xgb, predictions_lgb, predictions_xgb): train_stack = np.vstack([oof_lgb, oof_xgb]).transpose() test_stack = np.vstack([predictions_lgb, predictions_xgb]).transpose() folds_stack = RepeatedKFold(n_splits=9, n_repeats=2, random_state=4590) oof_stack = np.zeros(train_stack.shape[0]) predictions = np.zeros(test_stack.shape[0]) for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack, y_train_)): trn_data, trn_y = train_stack[trn_idx], y_train_[trn_idx] val_data, val_y = train_stack[val_idx], y_train_[val_idx] clf_3 = BayesianRidge() clf_3.fit(trn_data, trn_y) oof_stack[val_idx] = clf_3.predict(val_data) predictions += clf_3.predict(test_stack) / 18 loss = mean_squared_error(y_train_, oof_stack) print('merge loss:', loss) return predictions
def bayesianRidge(X_train, y_train, X_test, y_test, Identifier): ''' Fits Bayesian Ridge model on the data provided after feature selection. :param X_train: the data frame containing the selected features for training :param y_train: the data frame of target variable used for training :param X_test: the data frame of containing the selected features for testing :param y_test: the data frame of target variable for testing :param Identifier: whether called for time series prediction or news prediction :return: returns the error score and predicted values ''' bayesianRidge = BayesianRidge() bayesianRidge.fit(X_train, y_train) # save the model to disk if Identifier != "News": filename = '../Models/BayesianRidge' + 'TimeSeries' + '.sav' else: filename = '../Models/BayesianRidge' + 'News' + '.sav' joblib.dump(bayesianRidge, filename) prediction = bayesianRidge.predict(X_test) prediction = pd.DataFrame(prediction, index=y_test.index) error = mean_absolute_error(y_test, prediction) if Identifier != "News": makeGraph(y_test, valueFromTimeSeries=prediction, name="Time Series - Bayesian Ridge") else: makeGraph(y_test, valueFromNews=prediction, name="News - Bayesian Ridge") #print(prediction) statistic, pvalue = mannwhitneyu(y_test, pd.Series(prediction[0])) return error, prediction, pvalue
def ApplyBayesianRidge(self, train, test, cross_validation, full_train, config):
    BR = BayesianRidge(verbose=True, n_iter=1000, tol=0.00001)
    target_train = train[['Hazard']]
    cross_validation_test = cross_validation[['Hazard']]
    prepared_train = train[train.columns.difference(['Id', 'Hazard'])]
    print("prepared_train meta")
    print("shape", prepared_train.shape)
    print(prepared_train.head(3))

    BR.fit(prepared_train, target_train)
    dt = BR.predict(test[test.columns.difference(['Id'])])

    print("prediction score on cross validation")
    print(BR.score(
        cross_validation[cross_validation.columns.difference(['Id', 'Hazard'])],
        cross_validation_test))
    dt_cv = BR.predict(
        cross_validation[cross_validation.columns.difference(['Id', 'Hazard'])])

    test['Hazard'] = self.clipForecastValue(dt)
    cross_validation['predicted_Hazard'] = self.clipForecastValue(dt_cv)

    # print("sorted feature importance")
    # print(sorted(zip(map(lambda x: round(x, 4), BR.feature_importances_), names),
    #              reverse=True))
    print("regression model coefficients")
    print(BR.coef_)
    print("estimated precision of the noise")
    print(BR.alpha_)
    print("estimated precision of the weights")
    print(BR.lambda_)
    print("value of the objective function")
    print(BR.scores_)
    return test, cross_validation
def bayesian_ridge(X_train, y_train, X_test, y_test): ''' Purpose: Use Bayesian Ridge to calculate accuracy Input: X_train, y_train, X_test, y_test Output: accuracy_score ''' clf = BayesianRidge(compute_score=True) clf = clf.fit(X_train, y_train) y_pred = clf.predict(X_test) y_pred = y_pred.round() #ols = LinearRegression() #ols.fit(X, y) return metrics.accuracy_score(y_test, y_pred)
def _fit_bayesian_ridge(X: np.ndarray,
                        y: np.ndarray,
                        fit_intercept: bool = False,
                        **kwargs) -> Dict[str, Any]:
    """
    Returns the solution `a` to the linear problem `Xa=y` obtained by using
    Bayesian ridge regression as implemented in scikit-learn, in the form of
    a dictionary with a key named `parameters`.

    Parameters
    -----------
    X
        fit matrix
    y
        target array
    fit_intercept
        center data or not, forwarded to sklearn
    """
    brr = BayesianRidge(fit_intercept=fit_intercept, **kwargs)
    brr.fit(X, y)
    results = dict()
    results['parameters'] = brr.coef_
    return results
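# Hypothetical call to the helper above; any extra keyword is forwarded to
# BayesianRidge (here compute_score, which is a real BayesianRidge option).
import numpy as np

X_demo = np.random.rand(100, 6)
y_demo = X_demo @ np.arange(6, dtype=float)
res = _fit_bayesian_ridge(X_demo, y_demo, fit_intercept=False, compute_score=True)
print(res['parameters'])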
def MSE_Bay(train_data,lag,t_ahead,s_i): sample_x = np.transpose(train_data[:,:-lag]) for i in range(1, lag): sample_x = np.hstack([sample_x, np.transpose(train_data[:,i:-(lag-i)])]) sample_x = sample_x[:-t_ahead,:] # num_stream = 1 slding_predict_t = 730 landmark_win_ini_size = 367 # for s_i in range(num_stream): sample_y_si = np.transpose(train_data[s_i,t_ahead+lag-1:]) # print(sample_y_si[367]) reg_si = BayesianRidge() pre_y = [] act_y = [] for landmark_win in range(slding_predict_t): train_x = sample_x[:landmark_win_ini_size+landmark_win,:] train_y = sample_y_si[:landmark_win_ini_size+landmark_win] reg_si.fit(train_x,train_y) y_hat = reg_si.predict(sample_x[landmark_win_ini_size+landmark_win:landmark_win_ini_size+landmark_win+1,:]) pre_y.append(y_hat) act_y.append(sample_y_si[landmark_win_ini_size+landmark_win:landmark_win_ini_size+landmark_win+1]) # plt.plot(range(landmark_win_ini_size+1,landmark_win_ini_size+landmark_win+2),pre_y,label='prediction s'+str(s_i)) # plt.plot(range(landmark_win_ini_size+1,landmark_win_ini_size+landmark_win+2),act_y,label='actual') # plt.legend() # plt.show() # print(pre_y) # print(act_y) MSE = 0 for i in range (0,len(pre_y)): if not(np.isnan(pre_y[i])): MSE = MSE + (pre_y[i]-act_y[i])**2 # pre = np.array(pre_y) # act = np.array(act_y) # print(np.sum(pre-act)) return MSE,pre_y
def myRidgeHH(data, hh=1, dd=1): import numpy as np import pandas as pd import matplotlib.pyplot as plt "produces a forecast for half hour hh on day dd" "format is numpy array of shape (n_households,)" from sklearn.linear_model import BayesianRidge data1 = data[data.Day == dd] past1 = data1[data1.Week != 22] past = past1[past1.Week != 21] if hh == 1: h = [1, 2, 3] elif hh == 2: h = [1, 2, 3, 4] elif hh in range(3, 47): h = [hh - 2, hh - 1, hh, hh + 1, hh + 2] elif hh == 47: h = [45, 46, 47, 48] elif hh == 48: h = [46, 47, 48] XX = pd.DataFrame() YY = pd.DataFrame() for i in h: XX = XX.append(past[past.HH == i], ignore_index=True) YY = YY.append(past1[past1.HH == i], ignore_index=True) X = XX.transpose()[4:] Y = YY[(YY.Week == YY.Week.max()) & (YY.HH == hh)].transpose()[4:] BR = BayesianRidge() BR.fit(X, Y) XX_new = YY[YY.Week != YY.Week.min()] X_new = XX_new.transpose()[4:] forecast_hh_dd = BR.predict(X_new) return forecast_hh_dd
def Bayesian(self, X_train, X_test, y_train, y_test, date, q):
    # args: X_train, X_test, y_train, y_test, ['dd','mm','yyyy'], index of algo
    model = BayesianRidge(compute_score=True)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    df = pd.DataFrame({
        'Actual': y_test.flatten(),
        'Predicted': y_pred.flatten()
    })
    mse = mean_squared_error(y_test, y_pred)
    predictedval = model.predict([date])
    self.mse[q] = mse
    self.predicted_avgtemp[q] = predictedval[0]
    # Bayesian ridge returns a single prediction, so min/max are not available
    self.predicted_mintemp[q] = '-'
    self.predicted_maxtemp[q] = '-'
def br_modeling(data,y_name,candidates_location): from sklearn.linear_model import BayesianRidge temp=data.copy() print("made temp copy") candidates=get_variables("./%s"%candidates_location) print("got candidates for regressors") temp=rf_trim(temp,y_name,candidates) print("trimmed dataset") model=BayesianRidge() print("assigned model") res=model.fit(temp[candidates],temp[y_name]) print("fit model") joblib.dump(res,"./%sbr_model%s.pkl"%(y_name,datetime.datetime.today())) print("saved model") return res
def build_bayesian_rr(x_train, y_train, x_test, y_test, n_features):
    """
    Constructing a Bayesian ridge regression model from input dataframe
    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :param n_features: number of features, used to name the output pickle
    :return: None
    """
    clf = BayesianRidge()
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # Mean absolute error regression loss
    mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    # Mean squared error regression loss
    mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
    # Median absolute error regression loss
    median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
    # R^2 (coefficient of determination) regression score function
    r2 = sklearn.metrics.r2_score(y_test, y_pred)
    # Explained variance regression score function
    exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)
    # Estimated precision of the noise (alpha_) from the fitted model
    ridge_alpha = clf.alpha_

    with open('../trained_networks/brr_%d_data.pkl' % n_features, 'wb') as results:
        pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)

    return
def sale(data):
    data = int(data) + 1
    return log(data)

dataset = pandas.read_csv("input/train2_.csv")
testset = pandas.read_csv("input/test2_.csv")
dataset['Sale'] = dataset['Sales'].apply(sale)
labelData = dataset['Sale'].values
myId = testset['Id'].values
testset.drop(['Id'], inplace=True, axis=1)
testData = testset.iloc[:, :].values
dataset.drop(['Sales', 'Sale'], inplace=True, axis=1)
dataData = dataset.iloc[:, :].values

BRModel = BayesianRidge(compute_score=True)
BRModel.fit(dataset.iloc[:, :].values, labelData)
preds = numpy.column_stack((myId, BRModel.predict(testData))).tolist()
preds = [[int(i[0])] + [exp(float(i[1])) - 1] for i in preds]
print(BRModel.scores_)

with open("result/sub_BayesRidge.csv", "w") as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerow(["Id", "Sales"])
    writer.writerows(preds)
print((y_test[y_test == 1] == y_test_predictions[y_test == 1]).sum().astype(float) / y_test[y_test == 1].shape[0])
# 0.875

# But, at what expense do we do this? To find out, use the following command:
print((y_test_predictions == y_test).sum().astype(float) / y_test.shape[0])
# 0.967999

# Directly applying Bayesian ridge regression
from sklearn.datasets import make_regression
X, y = make_regression(1000, 10, n_informative=2, noise=20)

# We can just "throw" ridge regression at the problem with a few simple steps:
from sklearn.linear_model import BayesianRidge
br = BayesianRidge()
br.fit(X, y)
print(br.coef_)
# array([ 0.3000136 , -0.33023408, 68.166673  , -0.63228159,  0.07350987,
#        -0.90736606,  0.38851709, -0.8085291 ,  0.97259451, 68.73538646])

br_alphas = BayesianRidge(alpha_1=10, lambda_1=10)
br_alphas.fit(X, y)
print(br_alphas.coef_)
# array([ 0.30054387, -0.33130025, 68.10432626, -0.63056712,
#         0.07751436, -0.90919326,  0.39020878, -0.80822013,
#         0.97497567, 68.67409658])

# Using boosting to learn from errors
# Gradient boosting regression is a technique that learns from its mistakes.
def main(): usage = 'usage: %prog [options] <model_file>' parser = OptionParser(usage) parser.add_option('-c', dest='center_dist', default=10, type='int', help='Distance between the motifs and sequence center [Default: %default]') parser.add_option('-d', dest='model_hdf5_file', default=None, help='Pre-computed model output as HDF5 [Default: %default]') parser.add_option('-g', dest='cuda', default=False, action='store_true', help='Run on the GPGPU [Default: %default]') parser.add_option('-l', dest='seq_length', default=600, type='int', help='Sequence length [Default: %default]') parser.add_option('-o', dest='out_dir', default='heat', help='Output directory [Default: %default]') parser.add_option('-t', dest='targets', default='0', help='Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]') (options,args) = parser.parse_args() if len(args) != 1: parser.error('Must provide Basset model file') else: model_file = args[0] out_targets = [int(ti) for ti in options.targets.split(',')] if not os.path.isdir(options.out_dir): os.mkdir(options.out_dir) random.seed(1) # torch options cuda_str = '' if options.cuda: cuda_str = '-cuda' ################################################################# # place filter consensus motifs ################################################################# # determine filter consensus motifs filter_consensus = get_filter_consensus(model_file, options.out_dir, cuda_str) seqs_1hot = [] num_filters = len(filter_consensus) # num_filters = 40 filter_len = filter_consensus[0].shape[1] # position the motifs left_i = options.seq_length/2 - options.center_dist - filter_len right_i = options.seq_length/2 + options.center_dist ns_1hot = np.zeros((4,options.seq_length)) + 0.25 # ns_1hot = np.zeros((4,options.seq_length)) # for i in range(options.seq_length): # nt_i = random.randint(0,3) # ns_1hot[nt_i,i] = 1 for i in range(num_filters): for j in range(num_filters): # copy the sequence of N's motifs_seq = np.copy(ns_1hot) # write them into the one hot coding motifs_seq[:,left_i:left_i+filter_len] = filter_consensus[i] motifs_seq[:,right_i:right_i+filter_len] = filter_consensus[j] # save seqs_1hot.append(motifs_seq) # make a full array seqs_1hot = np.array(seqs_1hot) # reshape for spatial seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0],4,1,options.seq_length)) ################################################################# # place filter consensus motifs ################################################################# # save to HDF5 seqs_file = '%s/motif_seqs.h5' % options.out_dir h5f = h5py.File(seqs_file, 'w') h5f.create_dataset('test_in', data=seqs_1hot) h5f.close() # predict scores scores_file = '%s/motif_seqs_scores.h5' % options.out_dir torch_cmd = 'th basset_place2_predict.lua %s %s %s %s' % (cuda_str, model_file, seqs_file, scores_file) subprocess.call(torch_cmd, shell=True) # load in scores hdf5_in = h5py.File(scores_file, 'r') motif_seq_scores = np.array(hdf5_in['scores']) hdf5_in.close() ################################################################# # analyze ################################################################# for ti in out_targets: ################################################################# # compute pairwise expectations ################################################################# # X = np.zeros((motif_seq_scores.shape[0],num_filters)) # xi = 0 # for i in range(num_filters): # for j in range(num_filters): # X[xi,i] += 1 # X[xi,j] += 1 # xi += 1 X = np.zeros((motif_seq_scores.shape[0],2*num_filters)) xi = 
0 for i in range(num_filters): for j in range(num_filters): X[xi,i] += 1 X[xi,num_filters+j] += 1 xi += 1 # fit model model = BayesianRidge() model.fit(X, motif_seq_scores[:,ti]) # predict pairwise expectations motif_seq_preds = model.predict(X) print model.score(X, motif_seq_scores[:,ti]) # print filter coefficients coef_out = open('%s/coefs_t%d.txt' % (options.out_dir,ti), 'w') for i in range(num_filters): print >> coef_out, '%3d %6.2f' % (i,model.coef_[i]) coef_out.close() ################################################################# # normalize pairwise predictions ################################################################# filter_interaction = np.zeros((num_filters,num_filters)) table_out = open('%s/table_t%d.txt' % (options.out_dir,ti), 'w') si = 0 for i in range(num_filters): for j in range(num_filters): filter_interaction[i,j] = motif_seq_scores[si,ti] - motif_seq_preds[si] cols = (i, j, motif_seq_scores[si,ti], motif_seq_preds[si], filter_interaction[i,j]) print >> table_out, '%3d %3d %6.3f %6.3f %6.3f' % cols si += 1 table_out.close() scores_abs = abs(filter_interaction.flatten()) max_score = stats.quantile(scores_abs, .999) print 'Limiting scores to +-%f' % max_score filter_interaction_max = np.zeros((num_filters, num_filters)) for i in range(num_filters): for j in range(num_filters): filter_interaction_max[i,j] = np.min([filter_interaction[i,j], max_score]) filter_interaction_max[i,j] = np.max([filter_interaction_max[i,j], -max_score]) # plot heat map plt.figure() sns.heatmap(filter_interaction_max, xticklabels=False, yticklabels=False) plt.savefig('%s/heat_t%d.pdf' % (options.out_dir,ti))
# Linear Regression
print('linear')
lr = LinearRegression()
# lr.fit(x[:, np.newaxis], y)
# lr_sts_scores = lr.predict(xt[:, np.newaxis])
lr.fit(x, y)
lr_sts_scores = lr.predict(xt)

# Bayesian Ridge Regression
print('bayesian ridge')
br = BayesianRidge(compute_score=True)
# br.fit(x[:, np.newaxis], y)
# br_sts_scores = br.predict(xt[:, np.newaxis])
br.fit(x, y)
br_sts_scores = br.predict(xt)

# Elastic Net
print('elastic net')
enr = ElasticNet()
# enr.fit(x[:, np.newaxis], y)
# enr_sts_scores = enr.predict(xt[:, np.newaxis])
enr.fit(x, y)
enr_sts_scores = enr.predict(xt)

# Passive Aggressive Regression
print('passive aggressive')
par = PassiveAggressiveRegressor()
def prediction_BayesianRidge (X_train, Y_train, X_test, Y_test,normalize): # Print shapes of the training and testing data sets #print ("Shapes of the training and testing data sets") #print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape) #Create our regression object lreg = BayesianRidge(normalize=normalize) #do a linear regression, except only on the training lreg.fit(X_train,Y_train) #print("The estimated intercept coefficient is %.2f " %lreg.intercept_) #print("The number of coefficients used was %d " % len(lreg.coef_)) # Set a DataFrame from the Facts coeff_df = DataFrame(X_train.columns) coeff_df.columns = ["Fact"] # Set a new column lining up the coefficients from the linear regression coeff_df["Coefficient"] = pd.Series(lreg.coef_) # Show #coeff_df #highest correlation between a fact and fraction votes #print ("Highest correlation fact: %s is %.9f" % (cf_dict.loc[coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Fact"],"description"], coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Coefficient"]) ) #sns_plot = sns.jointplot(coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Fact"],"Fraction Votes",pd.merge(X_test,pd.DataFrame(Y_test), right_index=True, left_index=True),kind="scatter") #Predictions on training and testing sets pred_train = lreg.predict(X_train) pred_test = lreg.predict(X_test) # The mean square error #print("MSE with X_train and Y_train: %.6f" % np.mean((Y_train - pred_train) ** 2)) #print("MSE with X_test and Y_test: %.6f" %np.mean((Y_test - pred_test) ** 2)) #Explained variance score: 1 is perfect prediction #print("Variance score: %.2f" % lreg.score(X_test, Y_test)) result={} result["method"]="BayesianRidge" if normalize : result["normalize"]="Y" else: result["normalize"]="N" result["X_train_shape"]=X_train.shape result["Y_train_shape"]=Y_train.shape result["X_test_shape"]=X_test.shape result["Y_test_shape"]=Y_test.shape result["intercept"]=lreg.intercept_ result["num_coef"]=len(lreg.coef_) result["max_fact"]=cf_dict.loc[coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Fact"],"description"] result["max_fact_value"]=coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Coefficient"] result["MSE_train"]=np.mean((Y_train - pred_train) ** 2) result["MSE_test"]=np.mean((Y_test - pred_test) ** 2) result["variance"]=lreg.score(X_test, Y_test) return pred_test,coeff_df,pred_train,result
runs = [] for _ in range(10): train_latent_matrix = get_latent_matrix(x,y,x) test_latent_matrix = get_latent_matrix(x,y,x_test) # Clean out rows with NaN. #mask = ~np.any(np.isnan(train_latent_matrix), axis=1) #newx = train_latent_matrix[mask] #newy = y[mask] newx = np.nan_to_num(train_latent_matrix) newy = y #last_layer = SVR(kernel='rbf', C=1e3, gamma=0.1) last_layer = BayesianRidge() last_layer.fit(newx, newy) output = last_layer.predict(test_latent_matrix) assert len(output) == 8500 runs.append(output) #for i in runs: #print len(i) fout = open('modelz.10.output', 'w') for line in zip(*runs): avg =sum(line)/len(line) if avg > 5: avg = 5.0 elif avg < 0: avg = 0.0
def main(): usage = "usage: %prog [options] <model_file>" parser = OptionParser(usage) parser.add_option( "-c", dest="center_dist", default=10, type="int", help="Distance between the motifs and sequence center [Default: %default]", ) parser.add_option( "-d", dest="model_hdf5_file", default=None, help="Pre-computed model output as HDF5 [Default: %default]" ) parser.add_option( "-g", dest="cuda", default=False, action="store_true", help="Run on the GPGPU [Default: %default]" ) parser.add_option("-l", dest="seq_length", default=600, type="int", help="Sequence length [Default: %default]") parser.add_option("-o", dest="out_dir", default="heat", help="Output directory [Default: %default]") parser.add_option( "-t", dest="targets", default="0", help="Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]", ) (options, args) = parser.parse_args() if len(args) != 1: parser.error("Must provide Basset model file") else: model_file = args[0] out_targets = [int(ti) for ti in options.targets.split(",")] if not os.path.isdir(options.out_dir): os.mkdir(options.out_dir) random.seed(1) # torch options cuda_str = "" if options.cuda: cuda_str = "-cuda" ################################################################# # place filter consensus motifs ################################################################# # determine filter consensus motifs filter_consensus = get_filter_consensus(model_file, options.out_dir, cuda_str) seqs_1hot = [] # num_filters = len(filter_consensus) num_filters = 20 filter_len = filter_consensus[0].shape[1] # position the motifs left_i = options.seq_length / 2 - options.center_dist - filter_len right_i = options.seq_length / 2 + options.center_dist ns_1hot = np.zeros((4, options.seq_length)) + 0.25 # ns_1hot = np.zeros((4,options.seq_length)) # for i in range(options.seq_length): # nt_i = random.randint(0,3) # ns_1hot[nt_i,i] = 1 for i in range(num_filters): for j in range(num_filters): # copy the sequence of N's motifs_seq = np.copy(ns_1hot) # write them into the one hot coding motifs_seq[:, left_i : left_i + filter_len] = filter_consensus[i] motifs_seq[:, right_i : right_i + filter_len] = filter_consensus[j] # save seqs_1hot.append(motifs_seq) # make a full array seqs_1hot = np.array(seqs_1hot) # reshape for spatial seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0], 4, 1, options.seq_length)) ################################################################# # place filter consensus motifs ################################################################# # save to HDF5 seqs_file = "%s/motif_seqs.h5" % options.out_dir h5f = h5py.File(seqs_file, "w") h5f.create_dataset("test_in", data=seqs_1hot) h5f.close() # predict scores scores_file = "%s/motif_seqs_scores.h5" % options.out_dir torch_cmd = "th basset_place2_predict.lua %s %s %s %s" % (cuda_str, model_file, seqs_file, scores_file) subprocess.call(torch_cmd, shell=True) # load in scores hdf5_in = h5py.File(scores_file, "r") motif_seq_scores = np.array(hdf5_in["scores"]) hdf5_in.close() ################################################################# # analyze ################################################################# for ti in out_targets: ################################################################# # compute pairwise expectations ################################################################# # X = np.zeros((motif_seq_scores.shape[0],num_filters)) # xi = 0 # for i in range(num_filters): # for j in range(num_filters): # X[xi,i] += 1 # X[xi,j] += 1 # xi += 1 X = 
np.zeros((motif_seq_scores.shape[0], 2 * num_filters)) xi = 0 for i in range(num_filters): for j in range(num_filters): X[xi, i] += 1 X[xi, num_filters + j] += 1 xi += 1 # fit model model = BayesianRidge() model.fit(X, motif_seq_scores[:, ti]) # predict pairwise expectations motif_seq_preds = model.predict(X) print model.score(X, motif_seq_scores[:, ti]) # print filter coefficients coef_out = open("%s/coefs_t%d.txt" % (options.out_dir, ti), "w") for i in range(num_filters): print >> coef_out, "%3d %6.2f" % (i, model.coef_[i]) coef_out.close() ################################################################# # normalize pairwise predictions ################################################################# filter_interaction = np.zeros((num_filters, num_filters)) table_out = open("%s/table_t%d.txt" % (options.out_dir, ti), "w") si = 0 for i in range(num_filters): for j in range(num_filters): filter_interaction[i, j] = motif_seq_scores[si, ti] - motif_seq_preds[si] cols = (i, j, motif_seq_scores[si, ti], motif_seq_preds[si], filter_interaction[i, j]) print >> table_out, "%3d %3d %6.3f %6.3f %6.3f" % cols si += 1 table_out.close() # plot heat map plt.figure() sns.heatmap(filter_interaction) plt.savefig("%s/heat_t%d.pdf" % (options.out_dir, ti))
lambda_ = 4. w = np.zeros(n_features) # Only keep 10 weights of interest relevant_features = np.random.randint(0, n_features, 10) for i in relevant_features: w[i] = stats.norm.rvs(loc=0, scale=1. / np.sqrt(lambda_)) # Create noise with a precision alpha of 50. alpha_ = 50. noise = stats.norm.rvs(loc=0, scale=1. / np.sqrt(alpha_), size=n_samples) # Create the target y = np.dot(X, w) + noise ############################################################################### # Fit the Bayesian Ridge Regression and an OLS for comparison clf = BayesianRidge(compute_score=True) clf.fit(X, y) ols = LinearRegression() ols.fit(X, y) ############################################################################### # Plot true weights, estimated weights, histogram of the weights, and # predictions with standard deviations lw = 2 plt.figure(figsize=(6, 5)) plt.title("Weights of the model") plt.plot(clf.coef_, color='lightgreen', linewidth=lw, label="Bayesian Ridge estimate") plt.plot(w, color='gold', linewidth=lw, label="Ground truth") plt.plot(ols.coef_, color='navy', linestyle='--', label="OLS estimate") plt.xlabel("Features")
def nickmain1(): train_all = pd.read_csv(trainloc) target_all = pd.read_csv(trainloc) test_all = pd.read_csv(testloc) targets = ['Ca','P','pH','SOC','Sand'] train_cols_to_remove = ['PIDN']+targets train_all["Depth"] = train_all["Depth"].replace(["Topsoil", "Subsoil"],[10,-10]) test_all["Depth"] = test_all["Depth"].replace(["Topsoil", "Subsoil"],[10,-10]) common_features = ['BSAN','BSAS','BSAV','CTI','ELEV','EVI','LSTD','LSTN','REF1','REF2','REF3','REF7','RELI','TMAP','TMFI'] feats_list = {} colnames_nums = [] colnames = train_all.ix[:,'m7497.96':'m599.76'].columns.values for x in colnames: match = re.search(r'(?<=m)[0-9]*',x) if match: colnames_nums.append(int(match.group())) print len(colnames) print len(colnames_nums) print len(train_all.ix[0,'m7497.96':'m599.76'].values) for target in targets: selector = SelectKBest(f_regression, k=200) selector.fit_transform(train_all.ix[:,'m7497.96':'m599.76'], train_all[target]) selected = selector.get_support() feats = [col for (col,sel) in zip(list(train_all.ix[:,'m7497.96':'m599.76'].columns.values), selected) if sel] feats_list[target] = feats+common_features #pickTest = ['PIDN', 'BSAN','BSAS','BSAV','CTI','ELEV','EVI','LSTD','LSTN','REF1','REF2','REF3','REF7','RELI','TMAP','TMFI','Depth']#ORIGINAL10 ids = np.genfromtxt(testloc, dtype=str, skip_header=1, delimiter=',', usecols=0) df = pd.DataFrame({"PIDN": ids, "Ca": test_all['PIDN'], "P": test_all['PIDN'], "pH": test_all['PIDN'], "SOC": test_all['PIDN'], "Sand": test_all['PIDN']}) cv = cross_validation.KFold(len(train_all), n_folds=10, indices=False) subresults = {} results = [] if issub == False: for train_sub, test_sub in cv: for target in targets: #clf = ensemble.GradientBoostingRegressor(n_estimators=6) #clf = RandomForestRegressor(n_estimators = 40) #clf = linear_model.Lasso(alpha=0.08) #clf = svm.SVC() #clf = tree.DecisionTreeRegressor(min_samples_leaf=20) #clf = Ridge(alpha=1.0) #clf = ElasticNet(alpha=0.1, l1_ratio=0.7) clf = BayesianRidge(compute_score=True) clf.fit(np.array(train_all[feats_list[target]])[train_sub], np.array(train_all[target])[train_sub]) pred = clf.predict(np.array(train_all[feats_list[target]])[test_sub]) subresults[target] = ev.rmse(np.array(train_all[target])[test_sub],np.array(pred)) #df[target] = pred subtotal = 0 for x in subresults: subtotal = subtotal + subresults[x] print ("average for the run is ", subtotal/len(targets)) results.append(subtotal/len(targets)) print "Results: " + str( np.array(results).mean() ) else: for target in targets: #clf = ensemble.GradientBoostingRegressor(n_estimators=6) #clf = RandomForestRegressor(n_estimators = 20) #clf = linear_model.Lasso(alpha=0.08) #clf = svm.SVC() #clf = tree.DecisionTreeRegressor(min_samples_leaf=20) #clf = Ridge(alpha=1.0) #clf = ElasticNet(alpha=0.1, l1_ratio=0.7) clf = BayesianRidge(compute_score=True) clf.fit(np.array(train_all[feats_list[target]]), np.array(train_all[target])) pred = clf.predict(np.array(test_all[feats_list[target]])) df[target] = pred df.to_csv(predloc, index=False, cols=["PIDN","Ca","P","pH","SOC","Sand"])
lambda_ = 4. w = np.zeros(n_features) # Only keep 10 weights of interest relevant_features = np.random.randint(0, n_features, 10) for i in relevant_features: w[i] = stats.norm.rvs(loc=0, scale=1. / np.sqrt(lambda_)) # Create noise with a precision alpha of 50. alpha_ = 50. noise = stats.norm.rvs(loc=0, scale=1. / np.sqrt(alpha_), size=n_samples) # Create the target y = np.dot(X, w) + noise ############################################################################### # Fit the Bayesian Ridge Regression and an OLS for comparison clf = BayesianRidge(compute_score=True) clf.fit(X, y) ols = LinearRegression() ols.fit(X, y) ############################################################################### # Plot true weights, estimated weights and histogram of the weights pl.figure(figsize=(6, 5)) pl.title("Weights of the model") pl.plot(clf.coef_, 'b-', label="Bayesian Ridge estimate") pl.plot(w, 'g-', label="Ground truth") pl.plot(ols.coef_, 'r--', label="OLS estimate") pl.xlabel("Features") pl.ylabel("Values of the weights") pl.legend(loc="best", prop=dict(size=12))
df = pd.concat(frames, axis=0, ignore_index=True) ### Imputing DYAR train = df[(df.DYAR.isnull() ==False) & (df.pct_team_tgts.isnull() == False)] train.reset_index(inplace=True, drop=True) test = df[(df.DYAR.isnull() == True) & (df.pct_team_tgts.isnull() == False)] test.reset_index(inplace= True, drop=True) features = ['targets', 'receptions', 'rec_tds', 'start_ratio', 'pct_team_tgts', 'pct_team_receptions', 'pct_team_touchdowns', 'rec_yards', 'dpi_yards', 'fumbles', 'first_down_ctchs', 'pct_of_team_passyards'] X = scale(train[features]) y = train.DYAR # Our best model for predicting DYAR was a Bayesian Ridge Regressor br = BayesianRidge() br.fit(X,y) dyar_predictions = pd.DataFrame(br.predict(scale(test[features])), columns = ['DYAR_predicts']) test = test.join(dyar_predictions) test['DYAR'] = test['DYAR_predicts'] test.drop('DYAR_predicts', inplace=True, axis=1) frames = [train,test] df = pd.concat(frames, axis=0, ignore_index=True) ### Imputing EYds train = df[(df.EYds.isnull() ==False) & (df.pct_team_tgts.isnull() == False)] train.reset_index(inplace=True, drop=True) test = df[(df.EYds.isnull() == True) & (df.pct_team_tgts.isnull() == False)] test.reset_index(inplace= True, drop=True)
trainingcounts = counts[100:]
testcounts = counts[:100]
trainingrates = countrates[100:]
testrates = countrates[:100]
trainingtimes = times[100:]
testtimes = times[:100]

# using trainingcounts and training hists use log linear
#poisson_model = sm.GLM(trainingrates,
#                       sm.tools.tools.add_constant(traininghists),
#                       family=sm.families.Poisson(sm.genmod.families.links.log))
#results = poisson_model.fit()
#print(results.summary())
#x = results.predict(sm.tools.tools.add_constant(testhists))

clf = BayesianRidge(compute_score=True)
clf.fit(traininghists, trainingrates)
x = clf.predict(testhists)
answer = testrates

plt.plot(bins, x)
plt.plot(bins, answer)
plt.show()
def do_validation(data_path, steps=10):
    allfiles = initialize(data_path)
    gbm = GradientBoostingRegressor(n_estimators=100, learning_rate=0.05, max_depth=6,
                                    min_samples_leaf=5, subsample=0.5)
    ada = AdaBoostRegressor(n_estimators=200, learning_rate=1)
    etree = ExtraTreesRegressor(n_estimators=200, n_jobs=-1, min_samples_leaf=5)
    rf = RandomForestRegressor(n_estimators=200, max_features=4, min_samples_leaf=5)
    kn = KNeighborsRegressor(n_neighbors=25)
    logit = LogisticRegression(tol=0.05)
    enet = ElasticNetCV(l1_ratio=0.75, max_iter=1000, tol=0.05)
    svr = SVR(kernel="linear", probability=True)
    ridge = Ridge(alpha=18)
    bridge = BayesianRidge(n_iter=500)

    gbm_metrics = 0.0
    ada_metrics = 0.0
    etree_metrics = 0.0
    rf_metrics = 0.0
    kn_metrics = 0.0
    logit_metrics = 0.0
    svr_metrics = 0.0
    ridge_metrics = 0.0
    bridge_metrics = 0.0
    enet_metrics = 0.0
    nnet_metrics = 0.0

    logistic = LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

    for i in xrange(steps):
        driver = allfiles[i]
        df, Y = create_merged_dataset(driver)
        df['label'] = Y
        # Shuffle DF.
        df = df.reindex(np.random.permutation(df.index))

        train = df[:100]
        label = train['label']
        del train['label']

        test = df[100:400]
        Y = test['label']
        del test['label']

        #to_drop = ['driver', 'trip', 'speed1', 'speed2', 'speed3', 'speed4', 'speed5', 'speed6', 'speed7', 'speed8', 'speed9',
        #           'speed10', 'speed11', 'speed12', 'speed13', 'speed14', 'speed15', 'speed16', 'speed17', 'speed18', 'speed19',
        #           'speed20', 'speed21', 'speed22', 'speed23', 'speed24', 'speed25', 'speed26', 'speed27', 'speed28', 'speed29',
        #           'speed30', 'speed31', 'speed32', 'speed33', 'speed34', 'speed35', 'speed36', 'speed37', 'speed38', 'speed39',
        #           'speed40', 'speed41', 'speed42', 'speed43', 'speed44', 'speed45', 'speed46', 'speed47', 'speed48', 'speed49',
        #           'speed50', 'speed51', 'speed52', 'speed53', 'speed54', 'speed55', 'speed56', 'speed57', 'speed58', 'speed59',
        #           'speed60', 'speed61', 'speed62', 'speed63', 'speed64', 'speed65', 'speed66', 'speed67', 'speed68', 'speed69',
        #           'speed70', 'speed71', 'speed72', 'speed73', 'speed74', 'speed75', 'speed76', 'speed77', 'speed78', 'speed79', 'speed80']
        to_drop = ['driver', 'trip']

        X_train = train.drop(to_drop, 1)
        X_test = test.drop(to_drop, 1)

        gbm.fit(X_train, label)
        Y_hat = gbm.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        gbm_metrics += metrics.auc(fpr, tpr)

        ada.fit(X_train, label)
        Y_hat = ada.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        ada_metrics += metrics.auc(fpr, tpr)

        etree.fit(X_train, label)
        Y_hat = etree.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        etree_metrics += metrics.auc(fpr, tpr)

        rf.fit(X_train, label)
        Y_hat = rf.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        rf_metrics += metrics.auc(fpr, tpr)

        kn.fit(X_train, label)
        Y_hat = kn.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        kn_metrics += metrics.auc(fpr, tpr)

        # Linear models.
        to_drop = ['driver', 'trip', 'distance', 'sd_acceleration', 'final_angle', 'mean_acceleration',
                   'mean_avg_speed', 'sd_inst_speed', 'sd_avg_speed', 'mean_inst_speed', 'points']
        X_train = train.drop(to_drop, 1)
        X_test = test.drop(to_drop, 1)

        logit.fit(X_train, label)
        Y_hat = [i[1] for i in logit.predict_proba(X_test)]
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        logit_metrics += metrics.auc(fpr, tpr)

        svr.fit(X_train, label)
        Y_hat = svr.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        svr_metrics += metrics.auc(fpr, tpr)

        ridge.fit(X_train, label)
        Y_hat = ridge.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        ridge_metrics += metrics.auc(fpr, tpr)

        bridge.fit(X_train, label)
        Y_hat = bridge.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        bridge_metrics += metrics.auc(fpr, tpr)

        enet.fit(X_train, label)
        Y_hat = enet.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        enet_metrics += metrics.auc(fpr, tpr)

        classifier.fit(X_train, label)
        Y_hat = classifier.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        nnet_metrics += metrics.auc(fpr, tpr)

    print ""
    print "GBM:", gbm_metrics / steps
    print "AdaBoost:", ada_metrics / steps
    print "Extra Trees:", etree_metrics / steps
    print "RF:", rf_metrics / steps
    print "KN:", kn_metrics / steps
    print ""
    print "Logit:", logit_metrics / steps
    print "SVR:", svr_metrics / steps
    print "Ridge:", ridge_metrics / steps
    print "BayesianRidge:", bridge_metrics / steps
    print "Elastic Net:", enet_metrics / steps
    print "Neural Networks:", nnet_metrics / steps
    print ""
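The fit/predict/ROC-AUC bookkeeping above is repeated for every estimator; a possible refactor (a sketch, assuming the same X_train/label/X_test/Y variables; roc_auc_score replaces the manual roc_curve/auc pair with an equivalent result):

from sklearn import metrics

def auc_for(model, X_train, y_train, X_test, y_test, use_proba=False):
    # Fit one estimator and return its ROC AUC on the held-out trips.
    model.fit(X_train, y_train)
    if use_proba:
        y_hat = model.predict_proba(X_test)[:, 1]
    else:
        y_hat = model.predict(X_test)
    return metrics.roc_auc_score(y_test, y_hat)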
#sc = supervised_clustering.SupervisedClusteringRegressor(clf, connectivity=A,
#        n_iterations=30, verbose=1, n_jobs=8,
#        cv=ShuffleSplit(X_train.shape[0], n_splits=10, test_fraction=0.6,
#                        random_state=0))
t1 = time()
sc.fit(X_train, y_train)
sc_time = time() - t1
computed_coefs = sc.inverse_transform()
computed_coefs = np.reshape(computed_coefs, [size, size, size])
score = sc.score(X_test, y_test)

###############################################################################
# Compute the results for simple BayesianRidge
t1 = time()
clf.fit(X_train, y_train)
bayes_time = time() - t1
bayes_coefs = clf.coef_
bayes_score = clf.score(X_test, y_test)
bayes_coefs = bayes_coefs.reshape((size, size, size))

###############################################################################
# Plot the results
pl.close('all')
pl.figure()
pl.title('Scores of the supervised clustering')
pl.subplot(2, 1, 1)
pl.plot(np.arange(len(sc.scores_)), sc.scores_)
pl.xlabel('score')
def main():
    usage = 'usage: %prog [options] <repr_hdf5> <data_hdf5> <target_index>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='add_only', default=False, action='store_true',
                      help='Use additional features only; no sequence features')
    parser.add_option('-b', dest='balance', default=False, action='store_true',
                      help='Downsample the negative set to balance [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='postmodel',
                      help='Output directory [Default: %default]')
    parser.add_option('-r', dest='regression', default=False, action='store_true',
                      help='Regression mode [Default: %default]')
    parser.add_option('-s', dest='seq_only', default=False, action='store_true',
                      help='Use sequence features only; no additional features [Default: %default]')
    parser.add_option('--sample', dest='sample', default=None, type='int',
                      help='Sample from the training set [Default: %default]')
    parser.add_option('-t', dest='target_hdf5', default=None,
                      help='Extract targets from this HDF5 rather than data_hdf5 argument')
    parser.add_option('-x', dest='regex_add', default=None,
                      help='Filter additional features using a comma-separated list of regular expressions')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide full data HDF5, representation HDF5, and target index or filename')
    else:
        repr_hdf5_file = args[0]
        data_hdf5_file = args[1]
        target_i = args[2]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    random.seed(1)

    #######################################################
    # preprocessing
    #######################################################

    # load training targets
    data_hdf5_in = h5py.File(data_hdf5_file, 'r')
    if options.target_hdf5:
        target_hdf5_in = h5py.File(options.target_hdf5, 'r')
    else:
        target_hdf5_in = data_hdf5_in
    train_y = np.array(target_hdf5_in['train_out'])[:, target_i]
    test_y = np.array(target_hdf5_in['test_out'])[:, target_i]

    # load training representations
    if not options.add_only:
        repr_hdf5_in = h5py.File(repr_hdf5_file, 'r')
        train_x = np.array(repr_hdf5_in['train_repr'])
        test_x = np.array(repr_hdf5_in['test_repr'])
        repr_hdf5_in.close()

    if options.seq_only:
        add_labels = []
    else:
        # load additional features
        train_a = np.array(data_hdf5_in['train_add'])
        test_a = np.array(data_hdf5_in['test_add'])
        add_labels = np.array(data_hdf5_in['add_labels'])

        if options.regex_add:
            fi = filter_regex(options.regex_add, add_labels)
            train_a, test_a, add_labels = train_a[:, fi], test_a[:, fi], add_labels[fi]

        # append additional features
        if options.add_only:
            add_i = 0
            train_x, test_x = train_a, test_a
        else:
            add_i = train_x.shape[1]
            train_x = np.concatenate((train_x, train_a), axis=1)
            test_x = np.concatenate((test_x, test_a), axis=1)

    data_hdf5_in.close()
    if options.target_hdf5:
        target_hdf5_in.close()

    # balance
    if options.balance:
        train_x, train_y = balance(train_x, train_y)

    # sample
    if options.sample is not None and options.sample < train_x.shape[0]:
        sample_indexes = random.sample(range(train_x.shape[0]), options.sample)
        train_x = train_x[sample_indexes]
        train_y = train_y[sample_indexes]

    #######################################################
    # model
    #######################################################
    if options.regression:
        # fit
        model = BayesianRidge(fit_intercept=True)
        model.fit(train_x, train_y)

        # accuracy
        acc_out = open('%s/r2.txt' % options.out_dir, 'w')
        print >> acc_out, model.score(test_x, test_y)
        acc_out.close()

        test_preds = model.predict(test_x)

        # plot a sample of predictions versus actual
        plt.figure()
        sns.jointplot(test_preds[:5000], test_y[:5000], joint_kws={'alpha': 0.3})
        plt.savefig('%s/scatter.pdf' % options.out_dir)
        plt.close()

        # plot the distribution of residuals
        plt.figure()
        sns.distplot(test_y - test_preds)
        plt.savefig('%s/residuals.pdf' % options.out_dir)
        plt.close()
    else:
        # fit
        model = LogisticRegression(penalty='l2', C=1000)
        model.fit(train_x, train_y)

        # accuracy
        test_preds = model.predict_proba(test_x)[:, 1].flatten()
        acc_out = open('%s/auc.txt' % options.out_dir, 'w')
        print >> acc_out, roc_auc_score(test_y, test_preds)
        acc_out.close()

        # compute and print ROC curve
        fpr, tpr, thresholds = roc_curve(test_y, test_preds)
        roc_out = open('%s/roc.txt' % options.out_dir, 'w')
        for i in range(len(fpr)):
            print >> roc_out, '%f\t%f\t%f' % (fpr[i], tpr[i], thresholds[i])
        roc_out.close()

        # compute and print precision-recall curve
        precision, recall, thresholds = precision_recall_curve(test_y, test_preds)
        prc_out = open('%s/prc.txt' % options.out_dir, 'w')
        for i in range(len(precision)):
            print >> prc_out, '%f\t%f' % (precision[i], recall[i])
        prc_out.close()

    # save model
    joblib.dump(model, '%s/model.pkl' % options.out_dir)

    #######################################################
    # analyze
    #######################################################
    # print coefficients table
    coef_out = open('%s/add_coefs.txt' % options.out_dir, 'w')
    for ai in range(len(add_labels)):
        if options.regression:
            coefi = model.coef_[add_i + ai]
        else:
            coefi = model.coef_[0, add_i + ai]
        print >> coef_out, add_labels[ai], coefi
    coef_out.close()
y2 = svr.predict(x_test_scaled)

kr = KernelRidge(alpha=0.0001, coef0=1, degree=1, gamma=0.001, kernel='rbf', kernel_params=None)
kr.fit(x_train_scaled, y_train)
y3 = kr.predict(x_test_scaled)

lasso = Lasso(alpha=1e-09)
lasso.fit(x_train_scaled, y_train)
y4 = lasso.predict(x_test_scaled)

linear_ridge = Ridge(alpha=0.1)
linear_ridge.fit(x_train_scaled, y_train)
y5 = linear_ridge.predict(x_test_scaled)

bayesian_ridge = BayesianRidge(alpha_1=1e-05, alpha_2=10, lambda_1=10, lambda_2=1e-05)
bayesian_ridge.fit(x_train_scaled, y_train)
y6 = bayesian_ridge.predict(x_test_scaled)

sgd = SGDRegressor(alpha=0.1, epsilon=0.001, l1_ratio=0.2, loss='squared_loss', penalty='none', power_t=0.2)
sgd.fit(x_train_scaled, y_train)
y7 = sgd.predict(x_test_scaled)

###########################################
print '########## TESTING ERRORS ##########'
print "MAE for Linear Regression:", mean_absolute_error(y_test, y_predicted)
print "MAE for SVR:", mean_absolute_error(y_test, y2)
print "MAE for Kernel Ridge Regression:", mean_absolute_error(y_test, y3)
print "MAE for Lasso Regression:", mean_absolute_error(y_test, y4)
print "MAE for Linear Ridge Regression:", mean_absolute_error(y_test, y5)
print "MAE for Bayesian Ridge Regression:", mean_absolute_error(y_test, y6)
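The per-model fit/predict/MAE sequence above could also be driven by a loop; a sketch assuming the same x_train_scaled, x_test_scaled, y_train and y_test arrays:

from sklearn.metrics import mean_absolute_error

models = {
    "Kernel Ridge Regression": kr,
    "Lasso Regression": lasso,
    "Linear Ridge Regression": linear_ridge,
    "Bayesian Ridge Regression": bayesian_ridge,
    "SGD Regression": sgd,
}
for name, model in models.items():
    model.fit(x_train_scaled, y_train)
    print "MAE for %s:" % name, mean_absolute_error(y_test, model.predict(x_test_scaled))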
def main():
    parser = argparse.ArgumentParser(description="""Creates embeddings predictions.""")
    parser.add_argument('--train')
    parser.add_argument('--test')
    parser.add_argument('--embeddings')
    parser.add_argument('--cv', default=False)
    args = parser.parse_args()

    stoplist = stopwords.words("english")
    stoplist.extend("it's 've 's i'm he's she's you're we're they're i'll you'll he'll ".split(" "))

    embeddings = {}
    for line in codecs.open(args.embeddings, encoding="utf-8").readlines():
        line = line.strip()
        if line:
            a = line.split(" ")
            embeddings[a[0]] = np.array([float(v) for v in a[1:]])  # cast to float, otherwise we cannot operate

    train_indices = []
    test_indices = []
    train_scores = []
    train_features = []
    test_features = []

    # if args.learner == "logisticregression":
    #     learner = LogisticRegression()
    #     learner_type = "classification"
    # elif args.learner == "decisiontreeclassification":
    #     learner = tree.DecisionTreeClassifier()
    #     learner_type = "classification"
    # elif args.learner == "decisiontreeregression":
    #     learner = tree.DecisionTreeRegressor()
    #     learner_type = "regression"
    # elif args.learner == "bayesianridge":
    #     learner = BayesianRidge()
    #     learner_type = "regression"
    # else:
    learner = BayesianRidge()
    learner_type = "regression"

    le = preprocessing.LabelEncoder()

    for line in open(args.train).readlines():
        (index, score, tweet) = line.strip().split("\t")
        train_indices.append(index)
        train_scores.append(float(score))
        tweet = tweet.split(" ")
        train_features.append(embedfeats(tweet, embeddings, stoplist))

    train_indices = np.array(train_indices)
    train_scores = np.array(train_scores)
    train_features = np.array(train_features)

    train_scores_int = [roundup(v) for v in train_scores]
    le.fit(train_scores_int)
    train_scores_int_transformed = le.transform(train_scores_int)

    if args.cv:
        train_cv = {}
        cross = cross_validation.KFold(len(train_scores), n_folds=10)
        acc = []
        for train_index, test_index in cross:
            #if args.debug:
            #    print("TRAIN:", len(train_index), "TEST:", len(test_index))
            X = train_features
            y = train_scores
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            learner.fit(X_train, y_train)
            y_pred = learner.predict(X_test)
            assert(len(y_pred) == len(test_index))
            tids = train_indices[test_index]
            for twid, pred in zip(tids, y_pred):
                train_cv[twid] = pred
            acc.append(cosine_similarity(y_test, y_pred)[0][0])
        print >> sys.stderr, "Cosine of 10-folds:", acc
        print >> sys.stderr, "Macro average:", np.mean(np.array(acc)), np.std(np.array(acc))
        for twid in train_indices:
            print "{}\t{}".format(twid, train_cv[twid])
    else:
        for line in open(args.test).readlines():
            (index, score, tweet) = line.strip().split("\t")
            test_indices.append(index)
            #scores.append(score)
            tweet = tweet.split(" ")
            test_features.append(embedfeats(tweet, embeddings, stoplist))

        #print np.array(train_features).shape
        # when features are generated, train and test
        if learner_type == "regression":
            learner.fit(train_features, train_scores)
        else:
            learner.fit(train_features, train_scores_int_transformed)
        predicted_scores = learner.predict(test_features)
        if learner_type != "regression":
            predicted_scores = le.inverse_transform(predicted_scores)

        for index, score in zip(test_indices, predicted_scores):
            print index + "\t" + str(score)
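The feature builder embedfeats is not included in the snippet; a plausible minimal version (an assumption about its behaviour, not the original code) averages the embedding vectors of the non-stopword tokens and falls back to a zero vector when nothing is covered:

import numpy as np

def embedfeats(tokens, embeddings, stoplist):
    # Hypothetical sketch: mean embedding of the in-vocabulary, non-stopword tokens.
    dim = len(next(iter(embeddings.values())))
    vecs = [embeddings[t] for t in tokens if t in embeddings and t not in stoplist]
    if not vecs:
        return np.zeros(dim)
    return np.mean(vecs, axis=0)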
def bayes_ridge_reg(x_data, y_data):
    br = BayesianRidge()
    br.fit(x_data, y_data)
    print 'br params', br.coef_, br.intercept_
    adjusted_result = br.predict(x_data)
    return map(int, list(adjusted_result))
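A minimal, self-contained usage sketch for the function above, on synthetic data (values are illustrative only):

import numpy as np
from sklearn.linear_model import BayesianRidge

x_demo = np.random.randn(50, 3)
y_demo = x_demo.dot(np.array([1.5, -2.0, 0.5])) + np.random.normal(scale=0.1, size=50)
print bayes_ridge_reg(x_demo, y_demo)[:5]  # first few integer-truncated predictions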
if pd.isnull(row['Age']):
    for key in avg_age.keys():
        if key in row['Name']:
            tt.loc[index, "Age"] = avg_age[key]

#--------------------------------------------------------------------------------
#
X = td.loc[:, ['Sex', 'Age', 'Fare', 'SibSp', 'Parch', 'Pclass']].values
X = np.where(np.isnan(X), -1, X)
X_ = tt.loc[:, ['Sex', 'Age', 'Fare', 'SibSp', 'Parch', 'Pclass']].values
X_ = np.where(np.isnan(X_), -1, X_)
Y = td['Survived'].values

clf = BayesianRidge(lambda_1=10**-4, lambda_2=10**-4, alpha_1=10**2.75, alpha_2=10**3.3,
                    compute_score=True)  # 0.78947
model = clf.fit(X, Y)

# Result
predict_result = model.predict(X_).round(0).astype(int)
result = pd.DataFrame.from_items([('PassengerId', tt['PassengerId']), ('Survived', predict_result)])
result.to_csv('result/bayes_result.csv', index=False)

t1 = pd.read_csv("result/elastic_result_077512.csv")
t2 = pd.read_csv("result/bayes_result.csv")
t3 = t1 == t2
i = 0
for index, row in t3.iterrows():
    if row['Survived'] == False:
        i += 1