# Isaac Li
# 1.23.2018
#
# 5-fold cross-validation on data set 'a': fits `function.settings.model_lgb`
# on the log1p-transformed '血糖' (blood glucose) column and scores every fold
# with half the mean squared error, measured back on the original scale.
import time

import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

import function

# Load the data and move the target onto a log1p scale for training.
train, test = function.read_file(path='a')
train["血糖"] = np.log1p(train["血糖"])

# Project-specific feature steps; both are defined in the `function` module.
train, test = function.add_column(train, test)
train, test = function.transform(train, test)

print('\n\nStart...')
t0 = time.time()
mses = []

# One out-of-fold prediction per training row, and one column of test-set
# predictions per fold.
train_preds = np.zeros(train.shape[0])
test_preds = np.zeros((test.shape[0], 5))

# Every column of the test frame except the target is used as a predictor.
predictors = [f for f in test.columns if f != '血糖']

kf = KFold(n_splits=5, shuffle=True, random_state=520)
for i, (train_index, test_index) in enumerate(kf.split(train)):
    print(' .{}/5.'.format(i + 1))
    train_feat1 = train.iloc[train_index]
    train_feat2 = train.iloc[test_index]

    # '性别' (sex) and '体检日期' (examination date) are passed as categoricals.
    gbm = function.settings.model_lgb.fit(
        train_feat1[predictors],
        train_feat1['血糖'],
        categorical_feature=['性别', '体检日期'],
    )

    predict = gbm.predict(train_feat2[predictors])
    train_preds[test_index] += predict

    # Fold score: expm1 undoes the log1p applied to the target above.
    mses.append(
        0.5 * mean_squared_error(
            np.expm1(train_feat2['血糖']),
            np.expm1(predict),
        )
    )
    test_preds[:, i] = gbm.predict(test[predictors])

# Overall score computed from the complete out-of-fold prediction vector.
cv = 0.5 * mean_squared_error(np.expm1(train['血糖']), np.expm1(train_preds))
# Read the answer file for part A (GBK-encoded CSV under the source path).
ans_path = function.settings.source_path + 'd_answer_a_20180128.csv'
ans = pd.read_csv(ans_path, encoding='gbk')
# NOTICE: add a row in file as index!

# ------------ predict a -------------------------------------------------------
# Any non-empty answer skips Part A; pressing Enter (empty string) runs it.
# NOTE(review): the whole Part A pipeline is taken to be the body of the
# `else` branch — confirm against the original layout.
a = input('Part A? -> ')
if a:
    print('Done.')
else:
    print('Part A.')
    train, test = train_a, test_a

    # Train on a log1p-scaled '血糖' (blood glucose) target.
    train["血糖"] = np.log1p(train["血糖"])

    # `s` is set outside this excerpt; when it is falsy, add_column receives
    # an explicit test_item list (all items from index 3 on, plus '其他胆固醇').
    if s:
        train, test = function.add_column(train, test)
    else:
        list_test = function.settings.all_items + ['其他胆固醇']
        train, test = function.add_column(
            train,
            test,
            test_item=list_test[3:],
        )
    train, test = function.transform(train, test)

    print('\n\nStart...')
    t0 = time.time()
    mses = []

    # One slot per training row, and one test-prediction column per fold.
    train_preds = np.zeros(train.shape[0])
    test_preds = np.zeros((test.shape[0], 5))

    # Every column of the test frame except the target is a predictor.
    predictors = [f for f in test.columns if f != '血糖']

    kf = KFold(n_splits=5, shuffle=True, random_state=520)
    for i, (train_index, test_index) in enumerate(kf.split(train)):
        print(' .{}/5.'.format(i + 1))
        train_feat1 = train.iloc[train_index]
        train_feat2 = train.iloc[test_index]
# Isaac Li # 1.23.2018 import time import numpy as np from sklearn.model_selection import KFold from sklearn.metrics import mean_squared_error import function train, test = function.read_file() train["血糖"] = np.log1p(train["血糖"]) train, test = function.add_column(train, test, sqrt=True) train, test = function.transform(train, test) print('\n\nStart...') t0, mses = time.time(), [] train_preds, test_preds = np.zeros(train.shape[0]), np.zeros( (test.shape[0], 5)) predictors = [f for f in test.columns if f not in ['血糖']] kf = KFold(n_splits=5, shuffle=True, random_state=520) for i, (train_index, test_index) in enumerate(kf.split(train)): print(' .{}/5.'.format(i + 1)) train_feat1, train_feat2 = train.iloc[train_index], train.iloc[test_index] gbm = function.settings.model_lgb.fit(train_feat1[predictors], train_feat1['血糖'], categorical_feature=['性别', '体检日期']) predict = gbm.predict(train_feat2[predictors]) train_preds[test_index] += predict mses.append( .5 *
# Isaac Li
# 1.23.2018
#
# 5-fold cross-validation on data set 's': fits `function.settings.model_lgb`
# on the log1p-transformed '血糖' (blood glucose) column.
import time

import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

import function

# Load the data and move the target onto a log1p scale for training.
train, test = function.read_file(path='s')
train["血糖"] = np.log1p(train["血糖"])

# add_column gets an explicit item list: every configured item from index 3
# onwards, followed by '其他胆固醇'.
train, test = function.add_column(
    train,
    test,
    test_item=function.settings.all_items[3:] + ['其他胆固醇'],
)
train, test = function.transform(train, test)

print('\n\nStart...')
t0 = time.time()
mses = []

# One slot per training row, and one test-prediction column per fold.
train_preds = np.zeros(train.shape[0])
test_preds = np.zeros((test.shape[0], 5))

# Every column of the test frame except the target is used as a predictor.
predictors = [f for f in test.columns if f != '血糖']

kf = KFold(n_splits=5, shuffle=True, random_state=520)
for i, (train_index, test_index) in enumerate(kf.split(train)):
    print(' .{}/5.'.format(i + 1))
    train_feat1 = train.iloc[train_index]
    train_feat2 = train.iloc[test_index]

    # '性别' (sex) and '体检日期' (examination date) are passed as categoricals.
    gbm = function.settings.model_lgb.fit(
        train_feat1[predictors],
        train_feat1['血糖'],
        categorical_feature=['性别', '体检日期'],
    )

    # Predictions for the held-out rows of this fold (still on the log scale).
    predict = gbm.predict(train_feat2[predictors])
# Isaac Li
# 1.23.2018
#
# 5-fold cross-validation on the default data set: fits
# `function.settings.model_lgb` on the log1p-transformed '血糖' (blood glucose)
# column, with the `sqrt` and `power3_a` options of add_column switched on.
import time

import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

import function

# Load the data and move the target onto a log1p scale for training.
train, test = function.read_file()
train["血糖"] = np.log1p(train["血糖"])

# Project-specific feature steps; both are defined in the `function` module.
train, test = function.add_column(train, test, sqrt=True, power3_a=True)
train, test = function.transform(train, test)

print('\n\nStart...')
t0 = time.time()
mses = []

# One slot per training row, and one test-prediction column per fold.
train_preds = np.zeros(train.shape[0])
test_preds = np.zeros((test.shape[0], 5))

# Every column of the test frame except the target is used as a predictor.
predictors = [f for f in test.columns if f != '血糖']

kf = KFold(n_splits=5, shuffle=True, random_state=520)
for i, (train_index, test_index) in enumerate(kf.split(train)):
    print(' .{}/5.'.format(i + 1))
    train_feat1 = train.iloc[train_index]
    train_feat2 = train.iloc[test_index]

    # '性别' (sex) and '体检日期' (examination date) are passed as categoricals.
    gbm = function.settings.model_lgb.fit(
        train_feat1[predictors],
        train_feat1['血糖'],
        categorical_feature=['性别', '体检日期'],
    )

    # Predictions for the held-out rows of this fold (still on the log scale).
    predict = gbm.predict(train_feat2[predictors])

    # Constants consumed by code beyond this excerpt; their role is not
    # visible here.
    # NOTE(review): placement inside the fold loop is inferred — confirm.
    base = 1.7
    power = 1
    minimum = 7