def compute_score_for_mytest(each_df):
    """Score the saved per-column models on a held-out split of ``each_df``.

    For every column in the module-level ``use_columns`` the matching model
    is loaded from disk, rows are selected with ``split_my_val2`` (one call
    per ``name`` group), the column is predicted, and a range-normalised
    RMSE is printed.

    Side effects: ``each_df`` is modified in place (three derived columns
    are added and the index becomes ``['name', 'index']``).

    Args:
        each_df: DataFrame holding at least ``CHARTTIME``,
            ``CHARTTIME_last``, ``name``, ``index`` and every column listed
            in ``use_columns``.

    Returns:
        The score of the LAST column in ``use_columns`` (earlier scores are
        only printed), or ``None`` when ``use_columns`` is empty.
    """
    each_df['chart_cos'] = np.cos(each_df['CHARTTIME'] / 1440)
    each_df['chart_sin'] = np.sin(each_df['CHARTTIME'] / 1440)
    each_df['timediff'] = each_df['CHARTTIME'] - each_df['CHARTTIME_last']
    each_df.set_index(['name', 'index'], drop=True, inplace=True)
    # Gap-filled copy of the features, built group by group in parallel.
    each_df_fillna = pd.concat(
        Parallel(n_jobs=-1)(delayed(fillna)(each_group)
                            for name, each_group in each_df.groupby('name')))

    score = None  # stays None when there is nothing to score
    for each_column in use_columns:
        # Per-name value range, used below to normalise the error.
        # String names avoid the pandas deprecation for builtin max/min.
        each_max_min = each_df.groupby('name')[each_column].agg(
            ['max', 'min'])

        # Load the model; HCT/HGB models take the raw (un-filled) features.
        if each_column in ['HCT', 'HGB']:
            each_model = joblib.load(os.path.join('lgb统计', each_column))
            each_test_x = each_df.drop(each_column, axis=1)
        else:
            each_model = joblib.load(
                os.path.join('lgb统计加fillna', each_column))
            each_test_x = each_df_fillna.drop(each_column, axis=1)

        # Build the held-out ("mytest") selection.
        mytest_index = Parallel(n_jobs=1)(
            delayed(split_my_val2)(group)
            for name, group in each_test_x.groupby('name'))
        each_test_x = each_test_x.loc[mytest_index]
        each_test_y = each_df.loc[mytest_index][each_column]
        each_test_x[each_column] = each_model.predict(each_test_x)

        # Predictions are on the Box-Cox scale for these columns; restore
        # the original scale before scoring.
        if each_column in ['PBUN', 'PGLU', 'WBC', 'PLT']:
            each_test_x[each_column] = inv_boxcox1p(
                each_test_x[each_column], 0)
        elif each_column in ['PCRE']:
            each_test_x[each_column] = inv_boxcox1p(
                each_test_x[each_column], -1.5)

        mse = np.sum(
            np.square((each_test_y - each_test_x[each_column]) /
                      (each_max_min['max'] - each_max_min['min'])))
        score = np.sqrt(mse / each_test_x.shape[0])
        print(each_column, score)
    return score
def cal_loss(y_pre, d_train):
    """Custom objective: range-normalised residual and its hessian term.

    Reads the module-level globals ``X_train2``, ``X_val2`` and
    ``select_col``. Whether the call is for the training or the validation
    set is inferred from ``len(y_pre)``, which selects the frame supplying
    ``var_max`` / ``var_min``. The training frame is checked first.

    Args:
        y_pre: Predicted values (Box-Cox scale for transformed columns).
        d_train: Dataset-like object exposing ``get_label()``.

    Returns:
        ``(grad, hess)`` where ``grad`` is
        ``(y_pre - y_label) / (var_max - var_min)`` and ``hess`` is
        ``sqrt(|grad|)``.

    Raises:
        ValueError: If ``len(y_pre)`` matches neither ``X_train2`` nor
            ``X_val2`` (previously an opaque ``UnboundLocalError``).
    """
    y_label = d_train.get_label()
    # Undo the Box-Cox transform so the error is on the original scale.
    if select_col in ['PBUN', 'PGLU', 'WBC', 'PLT']:
        y_pre = inv_boxcox1p(y_pre, 0)
        y_label = inv_boxcox1p(y_label, 0)
    elif select_col in ['PCRE']:
        y_pre = inv_boxcox1p(y_pre, -1.5)
        y_label = inv_boxcox1p(y_label, -1.5)

    n_rows = len(y_pre)
    if n_rows == X_train2.shape[0]:
        value_range = X_train2['var_max'] - X_train2['var_min']
    elif n_rows == X_val2.shape[0]:
        value_range = X_val2['var_max'] - X_val2['var_min']
    else:
        raise ValueError(
            'cal_loss: %d predictions match neither the training set '
            '(%d rows) nor the validation set (%d rows)'
            % (n_rows, X_train2.shape[0], X_val2.shape[0]))

    grad = (y_pre - y_label) / value_range
    hess = np.power(np.abs(grad), 0.5)
    return grad, hess
def test_for_predict(each_dir, each_name):
    """Fill the missing values of one input file and write the result.

    Features are built with ``read_data_and_extract_features``; for every
    column in the module-level ``use_columns`` the saved model predicts the
    rows where that column is NaN. Observed values are kept as they are.
    The filled columns plus ``CHARTTIME`` are written to
    ``fillna_test_data/<each_name>`` without the index.

    Args:
        each_dir: Passed through to ``read_data_and_extract_features``.
        each_name: Passed through to the reader and reused as the output
            file name.
    """
    # Build features.
    each_df = read_data_and_extract_features(each_dir, each_name)
    to_each_df = pd.DataFrame()
    each_df['chart_cos'] = np.cos(each_df['CHARTTIME'] / 1440)
    each_df['chart_sin'] = np.sin(each_df['CHARTTIME'] / 1440)
    each_df['timediff'] = each_df['CHARTTIME'] - each_df['CHARTTIME_last']
    each_df.set_index(['name', 'index'], drop=True, inplace=True)
    each_df_fillna = pd.concat(
        Parallel(n_jobs=-1)(delayed(fillna)(each_group)
                            for name, each_group in each_df.groupby('name')))

    for each_column in use_columns:
        # Rows that need a prediction for this column.
        y_index = each_df[each_column].isna()
        # Every column uses the model trained on gap-filled features.
        each_model = joblib.load(os.path.join('lgb统计加fillna', each_column))
        each_test_x = each_df_fillna.drop(each_column, axis=1)
        each_test_x = each_test_x[y_index]
        each_test_not_x = each_df.drop(each_test_x.index)
        each_test_x[each_column] = each_model.predict(each_test_x)

        # Predictions are on the Box-Cox scale; restore the original scale.
        if each_column in ['PBUN', 'PGLU', 'WBC', 'PLT']:
            each_test_x[each_column] = inv_boxcox1p(
                each_test_x[each_column], 0)
        elif each_column in ['PCRE']:
            each_test_x[each_column] = inv_boxcox1p(
                each_test_x[each_column], -1.5)

        # Recombine observed rows with the freshly predicted ones.
        each_test_x = pd.concat([each_test_not_x, each_test_x])
        to_each_df[each_column] = each_test_x[each_column]

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs('fillna_test_data', exist_ok=True)
    to_each_df['CHARTTIME'] = each_df['CHARTTIME']
    to_each_df.sort_values(['index'], inplace=True)
    to_each_df.to_csv(os.path.join('fillna_test_data', each_name),
                      index=False)
def cal_val(y_pre, d_train):
    """Custom evaluation metric: range-normalised RMSE.

    Reads the module-level globals ``X_train2``, ``X_val2`` and
    ``select_col``. The frame supplying the ``max`` / ``min`` columns is
    picked by comparing ``len(y_pre)`` with each frame's row count. The
    validation frame is checked first.

    Args:
        y_pre: Predicted values (Box-Cox scale for transformed columns).
        d_train: Dataset-like object exposing ``get_label()``. The labels
            may be a NumPy array or a pandas Series.

    Returns:
        ``('t1', score, False)`` -- metric name, value, and the
        "higher is better" flag.

    Raises:
        ValueError: If ``len(y_pre)`` matches neither ``X_val2`` nor
            ``X_train2`` (previously an opaque ``UnboundLocalError``).
    """
    # np.asarray accepts both a Series and a plain ndarray, whereas
    # ``.values`` only exists on the former.
    y_label = np.asarray(d_train.get_label())
    # Undo the Box-Cox transform so the error is on the original scale.
    if select_col in ['PBUN', 'PGLU', 'WBC', 'PLT']:
        y_pre = inv_boxcox1p(y_pre, 0)
        y_label = inv_boxcox1p(y_label, 0)
    elif select_col in ['PCRE']:
        y_pre = inv_boxcox1p(y_pre, -1.5)
        y_label = inv_boxcox1p(y_label, -1.5)

    n_rows = len(y_pre)
    if n_rows == X_val2.shape[0]:
        bounds = X_val2
    elif n_rows == X_train2.shape[0]:
        bounds = X_train2
    else:
        raise ValueError(
            'cal_val: %d predictions match neither the validation set '
            '(%d rows) nor the training set (%d rows)'
            % (n_rows, X_val2.shape[0], X_train2.shape[0]))

    mse = np.sum(
        np.square((y_label - y_pre) / (bounds['max'] - bounds['min'])))
    score = np.sqrt(mse / n_rows)
    return 't1', score, False
def test_inv_boxcox():
    """Round trip: each inverse transform must undo its forward transform."""
    transform_pairs = (
        (boxcox, inv_boxcox),
        (boxcox1p, inv_boxcox1p),
    )
    for forward, inverse in transform_pairs:
        points = np.array([0., 1., 2.])
        lambdas = np.array([0., 1., 2.])
        transformed = forward(points, lambdas)
        recovered = inverse(transformed, lambdas)
        assert_almost_equal(points, recovered)
def predict_for_column(each_df, each_df_fillna, each_column):
    """Return ``each_column`` with its NaN rows replaced by predictions.

    The model stored under ``lgb统计加fillna/<each_column>`` predicts, from
    the gap-filled features, the rows where ``each_df[each_column]`` is NaN.
    Those predictions are appended after the rows that already had a value.

    Args:
        each_df: Original frame; supplies the NaN mask and observed rows.
        each_df_fillna: Gap-filled frame used as model input.
        each_column: Name of the column to complete.

    Returns:
        The ``each_column`` Series: observed rows first, predicted rows
        after them.
    """
    # Remember which rows have to be predicted.
    missing_mask = each_df[each_column].isna()

    model = joblib.load(os.path.join('lgb统计加fillna', each_column))
    features = each_df_fillna.drop(each_column, axis=1)
    to_predict = features[missing_mask]
    observed = each_df.drop(to_predict.index)
    to_predict[each_column] = model.predict(to_predict)

    # Predictions are on the Box-Cox scale for these columns; map back.
    boxcox_lambda = {
        'PBUN': 0,
        'PGLU': 0,
        'WBC': 0,
        'PLT': 0,
        'PCRE': -1.5,
    }.get(each_column)
    if boxcox_lambda is not None:
        to_predict[each_column] = inv_boxcox1p(to_predict[each_column],
                                               boxcox_lambda)

    completed = pd.concat([observed, to_predict])
    return completed[each_column]
def generate(x, y, filename):
    """Generate fixture data and write it to a JSON file.

    Evaluates ``inv_boxcox1p(x, y)`` and stores the inputs together with
    the result under the keys ``x``, ``y`` and ``expected``.

    # Arguments

    * `x`: domain
    * `y`: domain
    * `filename`: name of the output file, created inside `DIR`

    # Examples

    ```python
    python> x = np.linspace(-10.0, 10.0, 2001)
    python> y = np.linspace(-5.0, 5.0, 1001)
    python> generate(x, y, './data.json')
    ```
    """
    expected = inv_boxcox1p(x, y)
    payload = {
        'x': x.tolist(),
        'y': y.tolist(),
        'expected': expected.tolist(),
    }
    target = path.join(DIR, filename)
    with open(target, 'w') as handle:
        json.dump(payload, handle)
# Split the held-out frame into features and the target column.
df_test = df_test_middle.drop(columns='daysOnMarket')
df_test_label = df_test_middle['daysOnMarket']

# One 'categorical' tag per column of `data` (kept because later code may
# still read `value_list`).
value_list = ['categorical'] * len(data.columns)

# Mark every object-dtype column as categorical. A single pass over the
# columns replaces the former nested comprehension, which re-tested each
# column once per entry of `value_list`.
column_description1 = {
    key: 'categorical'
    for key in data.columns
    if data[key].dtype == 'object'
}
column_description2 = {
    'daysOnMarket': 'output',
    'buildingTypeId': 'categorical'
}
print(column_description1)
column_descriptions = dict(column_description1, **column_description2)

ml_predictor = Predictor(type_of_estimator='regressor',
                         column_descriptions=column_descriptions)
# Fixed typo: the estimator name is 'XGBRegressor', not 'XGBRegressr'.
ml_predictor.train(df_train, model_names='XGBRegressor')

x = ml_predictor.predict(df_test)
# Error on the original scale (inverse Box-Cox with lambda 0.15), followed
# by the error on the transformed scale.
print(
    mean_absolute_error(inv_boxcox1p(df_test_label, 0.15),
                        inv_boxcox1p(x, 0.15)))
print(mean_absolute_error(df_test_label, x))
def test_inv_boxcox1p_underflow():
    """For a tiny argument and a tiny lambda the inverse must return ~x."""
    tiny_x, tiny_lam = 1e-15, 1e-306
    result = inv_boxcox1p(tiny_x, tiny_lam)
    assert_allclose(result, tiny_x, rtol=1e-14)
# Missing values per column (notebook-style display expression).
test.isnull().sum()

# Impute: `runtime` with the column mean, the remaining columns with
# their most frequent value.
test["runtime"] = test["runtime"].fillna(test["runtime"].mean())
for mode_filled in ("status", "release_dayofweek", "release_quarter"):
    most_common = test[mode_filled].mode()[0]
    test[mode_filled] = test[mode_filled].fillna(most_common)

# Visual check that no gaps remain.
sns.heatmap(test.isnull())

X_test = test.drop(["id"], axis=1)
X_train.columns
X_test.columns

###### Building model
import xgboost
from sklearn.metrics import accuracy_score

predictor = xgboost.XGBRegressor()
predictor.fit(X_train, y_train)
pred_train = predictor.predict(X_train)
pred_test = predictor.predict(X_test)

# Map the predictions back from the Box-Cox scale (lambda 0.2).
pred_test_original = inv_boxcox1p(pred_test, 0.2)

# Submission frame: id plus predicted revenue.
PP = pd.concat([test.id], axis=1)
PP["revenue"] = pred_test_original
PP.head()
PP.to_csv("TMDB1stTry.csv", index=False)
cat_pred_train = model_cat.predict(train1.values)
# Negative predictions are clamped to zero before the error is computed.
negative = cat_pred_train < 0
cat_pred_train[negative] = 0
cat_train_error = msle(y_train, cat_pred_train)
print("Mean square logarithmic error of cat model on whole train = {:.4f}".
      format(cat_train_error))

# In[45]:

# Blend weights, in order: xgb, lgb, cat.
c = np.array([0.333334, 0.333333, 0.333333])
print("The sum of the entries of c is {}".format(c.sum()))
w_xgb, w_lgb, w_cat = c
train_pred = (xgb_pred_train * w_xgb + lgb_pred_train * w_lgb +
              cat_pred_train * w_cat)
blend_train_error = msle(y_train, train_pred)
print("Mean square logarithmic error of chosen model on whole train = {:.4f}".
      format(blend_train_error))

# In[47]:

lgb_pred = model_lgb.predict(test)
xgb_pred = model_xgb.predict(test.values)
cat_pred = model_cat.predict(test)

# In[48]:

# Write the results to the submission table.
blended_test = xgb_pred * w_xgb + lgb_pred * w_lgb + cat_pred * w_cat
pred = inv_boxcox1p(blended_test, 0.2)
submission_ids = np.arange(test.shape[0]) + 3001
sub = pd.DataFrame({"id": submission_ids, "revenue": pred})
sub.to_csv("C:/Users/jynkris/Desktop/sample_submission.csv", index=False)
def saveSubmission(iDs, preds):
    """Write ``submission_stacked.csv`` with ``Id`` and ``SalePrice``.

    Both inputs are mapped through ``inv_boxcox1p`` with the module-level
    lambda ``lda``. The ids are additionally rounded element-wise with the
    builtin ``round``, so ``iDs`` must support ``.apply``.

    Args:
        iDs: Transformed ids.
        preds: Transformed predictions.
    """
    restored_ids = inv_boxcox1p(iDs, lda)
    restored_prices = inv_boxcox1p(preds, lda)

    sub = pd.DataFrame()
    sub['Id'] = restored_ids.apply(round)
    sub['SalePrice'] = restored_prices
    sub.to_csv('submission_stacked.csv', index=False)
from scipy import stats, special
import pandas as pd
import numpy as np

# Fixed: pandas has no `read` function; CSV files are loaded with
# `read_csv`.
data = pd.read_csv('data.csv')
y = data.target

# Candidate lambdas for the Box-Cox transform.
lam_range = np.linspace(-2, 5, 100)
llf = np.zeros(lam_range.shape, dtype=float)

# Lambda estimate: evaluate the Box-Cox log-likelihood at each candidate.
for i, lam in enumerate(lam_range):
    llf[i] = stats.boxcox_llf(lam, y)  # y must be > 0

# Pick the lambda with the maximum log-likelihood (llf).
lam_best = lam_range[llf.argmax()]

# NOTE(review): the likelihood above is evaluated on `y`, while the
# transform below is boxcox1p, i.e. Box-Cox of `1 + y` -- confirm that
# this mismatch is intended.
# Apply the Box-Cox transform to the target variable.
y_boxcox = special.boxcox1p(y, lam_best)

# Apply the inverse Box-Cox transform to recover the target variable.
y_invboxcox = special.inv_boxcox1p(y_boxcox, lam_best)
def predict():
    '''
    For rendering results on HTML GUI.

    Reads the submitted form values, converts the first ten to integers and
    applies ``boxcox1p`` with lambda 0.15. They are placed in an otherwise
    fixed 80-feature row, which is passed to the module-level ``model``.
    The page ``index.html`` is rendered with the resulting price.

    Raises:
        IndexError: If fewer than ten form values were submitted.
        ValueError: If one of the first ten values is not an integer.
    '''
    from scipy.special import boxcox1p, inv_boxcox1p

    lam = 0.15
    int_features = list(request.form.values())
    # Only the first ten submitted values are user-controlled features.
    for position in range(10):
        int_features[position] = boxcox1p(
            np.int64(int_features[position]), lam)

    columns = [
        'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
        'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
        'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
        'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt',
        'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st',
        'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond',
        'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
        'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF',
        'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical',
        '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',
        'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
        'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional',
        'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
        'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
        'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
        'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
        'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
        'SaleCondition', 'TotalSF',
    ]
    # Fixed values for every feature the form does not supply; the ten
    # user inputs are placed at their matching column positions.
    values = [
        7.990963041593332, 1.8203341036428238, 9.125735246126716,
        35.391370879389704, 0.7304631471189666, 0.7304631471189666,
        1.5409627556327752, 1.5409627556327752, 0.0005003172240540867,
        1.4013368444274956, 0.0431549546952863, 2.9929161754443663,
        int_features[5], 1.1948967456559554, 0.24556737057378078,
        1.4155057023841355, int_features[1], 2.1604465664352563,
        14.13734536216387, int_features[7], 0.8932636402684723,
        0.7538300024576454, 2.7582173064497746, 2.844430978246579,
        1.071057748915512, 3.268986443449214, 1.3624422931139548,
        1.7328990595033906, 0.8834994399659468, 1.2470924388496651,
        1.4569085852270438, 1.2062523036053971, 1.317433970123405,
        7.084892889342716, 1.9657590475261213, 1.0517430422748202,
        9.391255388699905, 11.89254623970312, 0.7446911405592987,
        0.7563474000353817, 0.6829330108338099, 1.6815016774004494,
        int_features[4], 4.917140353780343, 0.15794652317139848,
        int_features[2], 0.30751236837651724, 0.04166142948306093,
        int_features[6], 0.2774860242213367, 1.4742074483529004,
        0.7517241594265581, 1.2746341259746379, int_features[8],
        2.1887574652310366, 0.42455642997342163, 1.3031568396814526,
        1.140331159374495, 14.151202776364704, 0.7722995531965653,
        int_features[3], int_features[9], 1.774185368855956,
        1.7865149945239338, 1.1111640610733957, 3.726756270282964,
        3.253621458595589, 1.0382729368834795, 0.13072845037668754,
        0.6225990043864975, 0.050872777658061384, 1.1920461692064572,
        1.131509564697584, 1.1922835973693042, 0.40283910651921423,
        2.230572790696048, 14.195035682494952, 2.478799472550466,
        1.7025947724500126, int_features[0],
    ]
    xdd = dict(zip(columns, values))
    # DataFrame.append was removed in pandas 2.0; build the one-row frame
    # directly, keeping the column order.
    df_s = pd.DataFrame([xdd], columns=columns)

    prediction = model.predict(df_s)
    print(prediction)
    prediction = inv_boxcox1p(prediction, 0.15)
    # NOTE(review): the inverse transform is deliberately applied a second
    # time (the original comment says it is an addition "to look
    # realistic"). Kept as is -- confirm that this is really wanted.
    prediction = inv_boxcox1p(prediction, 0.15)
    print(prediction)
    output = round(prediction[0], 2)

    return render_template(
        'index.html',
        prediction_text='House price should be $ {}'.format(output))
def train_val_test(each_df, each_column, save_dir):
    """Train, save and score a LightGBM model for one column.

    Side effects:
        * Sets the module-level globals ``select_col``, ``X_train2`` and
          ``X_val2`` (read by ``cal_val`` during training).
        * Modifies ``each_df`` in place: resets and sorts it, adds derived
          columns, Box-Cox transforms ``each_column`` where applicable and
          sets the index to ``['name', 'index']``.
        * Writes the trained model to ``<save_dir>/<each_column>``.

    Args:
        each_df: Frame with ``name``, ``index``, ``CHARTTIME``,
            ``CHARTTIME_last`` and the column to model.
        each_column: Name of the target column.
        save_dir: Directory that receives the model (created if missing).

    Returns:
        Range-normalised RMSE on the rows listed for ``each_column`` in the
        module-level ``naidx_testinfo`` table, computed from its
        ``var_value``, ``var_max`` and ``var_min`` columns.
    """
    global X_train2, X_val2, select_col
    select_col = each_column

    each_df.reset_index(drop=True, inplace=True)
    each_df.sort_values(['name', 'index'], inplace=True)
    each_df['chart_cos'] = np.cos(each_df['CHARTTIME'] / 1440)
    each_df['chart_sin'] = np.sin(each_df['CHARTTIME'] / 1440)
    each_df['timediff'] = each_df['CHARTTIME'] - each_df['CHARTTIME_last']

    # Box-Cox transform the target for the columns that use it.
    if each_column in ['PBUN', 'PGLU', 'WBC', 'PLT']:
        each_df[each_column] = boxcox1p(each_df[each_column], 0)
    elif each_column in ['PCRE']:
        each_df[each_column] = boxcox1p(each_df[each_column], -1.5)

    each_df_copy = each_df.drop([each_column], axis=1)
    each_max_min = naidx_testinfo[naidx_testinfo['column_name'] ==
                                  each_column]

    # Gap-fill the features per name; HCT and HGB use the raw features.
    if each_column not in ['HCT', 'HGB']:
        each_df_copy = pd.concat(
            Parallel(n_jobs=-1)(
                delayed(fillna)(each_group)
                for name, each_group in each_df_copy.groupby('name')))

    each_df_copy.set_index(['name', 'index'], drop=True, inplace=True)
    each_max_min.set_index(['name', 'index'], drop=True, inplace=True)
    each_df.set_index(['name', 'index'], drop=True, inplace=True)

    # Rows listed in the test-info table form the test set; everything
    # else is available for training.
    each_test = pd.merge(each_df_copy, each_max_min,
                         left_index=True, right_index=True)
    each_train_x = each_df_copy.loc[each_df_copy.index.drop(each_test.index)]
    each_train_y = each_df.loc[each_train_x.index, each_column]
    each_test_x = each_test[each_train_x.columns]

    # Only rows with a known target can be trained on.
    each_train_y = each_train_y.dropna()
    each_train_x = each_train_x.loc[each_train_y.index]
    X_train, X_val, y_train, y_val = train_test_split(each_train_x,
                                                      each_train_y,
                                                      test_size=0.3,
                                                      random_state=2019)

    # Per-name target range from the training data. String names avoid
    # the pandas deprecation for builtin max/min.
    train_max_min = each_train_y.groupby('name').agg(['max', 'min'])
    # The range has to be on the original scale.
    if each_column in ['PBUN', 'PGLU', 'WBC', 'PLT']:
        train_max_min['max'] = inv_boxcox1p(train_max_min['max'], 0)
        train_max_min['min'] = inv_boxcox1p(train_max_min['min'], 0)
    elif each_column in ['PCRE']:
        train_max_min['max'] = inv_boxcox1p(train_max_min['max'], -1.5)
        train_max_min['min'] = inv_boxcox1p(train_max_min['min'], -1.5)
    X_train2 = pd.merge(X_train, train_max_min,
                        left_index=True, right_index=True)
    X_val2 = pd.merge(X_val, train_max_min,
                      left_index=True, right_index=True)

    # Wrap the data in LightGBM datasets.
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_val, y_val)  # validation data
    params = {
        'learning_rate': 0.05,
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': ['mse'],
        'colsample_bytree': 0.9,
        'subsample': 0.9,
        'num_leaves': 30,  # number of leaves
        'min_data': 50,  # minimum number of samples per leaf
        'max_depth': -1,  # tree depth
        'lambda_l2': 0.001,  # L2 regularisation
        'lambda_l1': 0.01,  # L1 regularisation
        'num_threads': 12,
        'verbose': -1,
        'tree_learner': 'voting',
        'seed': 2019
    }

    # Train with early stopping on the custom metric.
    gbm = lgb.train(params,
                    lgb_train,
                    valid_sets=[lgb_train, lgb_eval],
                    feval=cal_val,
                    num_boost_round=3000,
                    early_stopping_rounds=100,
                    verbose_eval=100)
    each_test['pred'] = gbm.predict(each_test_x)

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(save_dir, exist_ok=True)
    joblib.dump(gbm, os.path.join(save_dir, each_column))

    # Restore the original scale before computing the score.
    if each_column in ['PBUN', 'PGLU', 'WBC', 'PLT']:
        each_test['pred'] = inv_boxcox1p(each_test['pred'], 0)
    elif each_column in ['PCRE']:
        each_test['pred'] = inv_boxcox1p(each_test['pred'], -1.5)
    mse = np.sum(
        np.square((each_test['var_value'] - each_test['pred']) /
                  (each_test['var_max'] - each_test['var_min'])))
    score = np.sqrt(mse / each_test.shape[0])
    return score