def fit_predict(self, train, val=None, test=None, **kwa):
    # Median regression (q=0.5); train, val and test are (X, y) tuples.
    model = QuantReg(train[1], train[0]).fit(q=0.5, max_iter=10000)
    if val is None:
        return model.predict(test[0])
    else:
        return model.predict(val[0]), model.predict(test[0])
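# A minimal usage sketch for the fit_predict above (assumption: train/test are
# (X, y) tuples of numpy arrays; the data below is synthetic and illustrative).
import numpy as np
import statsmodels.api as sm
from statsmodels.regression.quantile_regression import QuantReg

rng = np.random.default_rng(0)
X = sm.add_constant(rng.normal(size=(200, 1)))
y = X @ np.array([1.0, 2.0]) + rng.normal(size=200)
train, test = (X[:150], y[:150]), (X[150:], y[150:])
# The same steps fit_predict performs when val is None:
model = QuantReg(train[1], train[0]).fit(q=0.5, max_iter=10000)
test_pred = model.predict(test[0])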
def test_fitted_residuals():
    data = sm.datasets.engel.load_pandas().data
    y, X = dmatrices('foodexp ~ income', data, return_type='dataframe')
    res = QuantReg(y, X).fit(q=.1)
    # Note: maxabs relative error with fitted is 1.789e-09
    assert_almost_equal(np.array(res.fittedvalues), Rquantreg.fittedvalues, 5)
    assert_almost_equal(np.array(res.predict()), Rquantreg.fittedvalues, 5)
    assert_almost_equal(np.array(res.resid), Rquantreg.residuals, 5)
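# A self-contained sanity check that needs no R fixture (sketch; assumes the
# same sm / dmatrices / QuantReg / np imports as the test above): fitted
# values plus residuals must reconstruct the observed response.
def test_fitted_plus_resid_reconstructs_endog():
    data = sm.datasets.engel.load_pandas().data
    y, X = dmatrices('foodexp ~ income', data, return_type='dataframe')
    res = QuantReg(y, X).fit(q=.1)
    assert np.allclose(res.fittedvalues + res.resid, data['foodexp'])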
def train_LAD(x, y):
    """Train an LAD (least absolute deviations) linear regression model
    and return its in-sample predictions."""
    X = sm.add_constant(x)
    model = QuantReg(y, X)
    model = model.fit(q=0.5)  # q=0.5 gives median regression, i.e. LAD
    re = model.predict(X)
    return re
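# Illustrative call with synthetic data (assumes numpy and the `sm` alias for
# statsmodels.api are available in this module's scope):
import numpy as np

x = np.linspace(0, 10, 100)
y = 3.0 * x + 1.0 + np.random.default_rng(0).standard_normal(100)
fitted = train_LAD(x, y)
print(np.abs(y - fitted).mean())  # mean absolute deviation of the LAD fit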
class SkQuantReg:
    """Minimal sklearn-style wrapper around statsmodels QuantReg."""

    def __init__(self, tau):
        self.tau = tau

    def fit(self, X, y):
        self.m = QuantReg(y, X).fit(q=self.tau)
        return self

    def predict(self, X):
        return self.m.predict(X)
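# Sketch of the wrapper in use (synthetic data); note that X is passed to
# QuantReg as-is, so add a constant column yourself if you want an intercept.
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(1)
X = sm.add_constant(rng.normal(size=(500, 2)))
y = X @ np.array([0.5, 1.0, -1.0]) + rng.normal(size=500)
q90 = SkQuantReg(tau=0.9).fit(X, y).predict(X)
print((y <= q90).mean())  # empirical coverage, should be near 0.9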
def train_predict_stacking_linear_regression(df_learning, df_prod, l_tuple_strategy_normalised):
    for quantile in constants.LIST_QUANTILE:
        to_keep = []
        for strategy, normalize_by in l_tuple_strategy_normalised:
            str_normalized = '_normed_by_' + normalize_by if normalize_by is not None else ''
            to_keep.append('{}{}_quantile_{:.3f}'.format(
                strategy, str_normalized, quantile))

        # Remove NA columns
        to_keep = df_learning[to_keep].notnull().all()
        to_keep = to_keep[to_keep].index.tolist()

        # We need to remove constant columns from the sampled data
        df_learning_weighted = df_learning.sample(
            10000, weights='weight', replace=True, random_state=1)

        # Remove constant columns
        cols_constants = df_learning_weighted[to_keep].std() == 0
        cols_constants = cols_constants[cols_constants].index.tolist()
        for col in cols_constants:
            to_keep.remove(col)

        # # Remove correlated features
        # # Create correlation matrix
        # corr_matrix = df_learning[to_keep].corr().abs().fillna(1)
        # # Select upper triangle of correlation matrix
        # upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        # # Find index of feature columns with correlation greater than 0.95
        # to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
        # to_keep.remove(to_drop)

        # Drop duplicate columns
        def getDuplicateColumns(df):
            '''
            Get a list of duplicate columns.
            Iterates over all the columns in the dataframe and finds the
            columns whose contents are duplicates.
            :param df: Dataframe object
            :return: List of columns whose contents are duplicates.
            '''
            duplicateColumnNames = set()
            # Iterate over all the columns in the dataframe
            for x in range(df.shape[1]):
                # Select the column at the xth index
                col = df.iloc[:, x]
                # Compare with every column from the (x+1)th index onwards
                for y in range(x + 1, df.shape[1]):
                    # Select the column at the yth index
                    otherCol = df.iloc[:, y]
                    # Check if the columns at the x and y indices are equal
                    if col.equals(otherCol):
                        duplicateColumnNames.add(df.columns.values[y])
            return list(duplicateColumnNames)

        cols_duplicate = getDuplicateColumns(df_learning_weighted[to_keep])
        for cols in cols_duplicate:
            to_keep.remove(cols)
        # to_keep = df_learning_weighted[to_keep].T.drop_duplicates().T.columns  # Not efficient but ok

        X_learning_weighted = df_learning_weighted[to_keep].fillna(0)
        X_learning = df_learning[to_keep].fillna(0)
        X_prod = df_prod[to_keep].fillna(0)
        y_learning_weighted = df_learning_weighted['sales']
        # weight_learning = df_learning['weight']

        if X_learning_weighted.nunique().max() != 1:
            linear_model = QuantReg(y_learning_weighted, X_learning_weighted)
            linear_model = linear_model.fit(q=quantile)
            # print(linear_model.summary())
            df_learning['quantile_{:.3f}'.format(
                quantile)] = linear_model.predict(X_learning)
            df_prod['quantile_{:.3f}'.format(quantile)] = linear_model.predict(
                X_prod)
        else:
            df_learning['quantile_{:.3f}'.format(quantile)] = 0
            df_prod['quantile_{:.3f}'.format(quantile)] = 0
    return df_learning, df_prod
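# The nested getDuplicateColumns scan above is O(n^2) in columns; a shorter
# pandas idiom with the same semantics for exact duplicates (sketch, keeps the
# first occurrence and flags later copies):
import pandas as pd

df_demo = pd.DataFrame({'a': [1, 2, 3], 'b': [1, 2, 3], 'c': [0, 0, 1]})
dup_cols = df_demo.columns[df_demo.T.duplicated()].tolist()
assert dup_cols == ['b']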
def train_predict_lgb_tweedie(df_learning, df_prod, verbose_eval=75):
    """
    Args:
        - df_learning
        - df_prod
    Returns:
        - df_valid with quantile prediction and pinball loss
        - df_prod with quantile prediction
    """
    (
        df_learning, df_train, df_valid, df_valid_oof,
        X_learning, X_train, X_valid, X_valid_oof, X_prod,
        y_learning, y_train, y_valid, y_valid_oof,
        weight_learning, weight_train, weight_valid, weight_valid_oof,
        lgb_learning, lgb_train, lgb_valid,
    ) = prepare_data(df_learning, df_prod)

    param, num_boost_round, early_stopping_rounds = get_lgb_params(
        objective='tweedie', dataset_nrows=df_learning.shape[0])

    col_predict = 'pred'
    df_learning_pred, df_valid_pred, df_valid_oof, df_prod = train_predict_lgb(
        df_learning, df_valid, X_learning, X_valid, df_valid_oof, df_prod,
        X_valid_oof, X_prod, lgb_train, lgb_valid, lgb_learning, param,
        num_boost_round, early_stopping_rounds, verbose_eval, col_predict)

    from statsmodels.regression.quantile_regression import QuantReg
    df_learning_weighted = df_learning.sample(100000, weights='weight', replace=True)

    to_keep = ['pred', 'horizon']
    X_learning_weighted = df_learning_weighted[to_keep]
    X_learning = df_learning[to_keep]
    X_valid_oof = df_valid_oof[to_keep]
    X_prod = df_prod[to_keep]
    # y_learning = df_learning['sales']
    y_learning_weighted = df_learning_weighted['sales']

    for quantile in constants.LIST_QUANTILE:
        # QuantReg has no weight parameter, so we manually reweight the
        # dataset by resampling with the observation weights.
        linear_model = QuantReg(y_learning_weighted, X_learning_weighted)
        linear_model = linear_model.fit(q=quantile)
        # print(linear_model.summary())
        df_learning['quantile_{:.3f}'.format(quantile)] = linear_model.predict(
            X_learning)
        df_valid_oof['quantile_{:.3f}'.format(
            quantile)] = linear_model.predict(X_valid_oof)
        df_prod['quantile_{:.3f}'.format(quantile)] = linear_model.predict(
            X_prod)

    df_valid_oof = prep.compute_pinball(df_valid_oof)
    return df_valid_oof, df_prod
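# prep.compute_pinball is project code; for reference, a minimal pinball
# (quantile) loss for a single quantile looks like this (sketch):
import numpy as np

def pinball_loss(y_true, y_pred, quantile):
    # Penalize under-prediction by q and over-prediction by (1 - q).
    diff = np.asarray(y_true) - np.asarray(y_pred)
    return np.mean(np.maximum(quantile * diff, (quantile - 1) * diff))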
def train_predict_lgb_point_to_uncertainity(df_learning, df_prod, verbose_eval):
    """
    Args:
        - df_learning
        - df_prod
    Returns:
        - df_valid with quantile prediction and pinball loss
        - df_prod with quantile prediction
    """
    (
        df_learning, df_train, df_valid, df_valid_oof,
        X_learning, X_train, X_valid, X_valid_oof, X_prod,
        y_learning, y_train, y_valid, y_valid_oof,
        weight_learning, weight_train, weight_valid, weight_valid_oof,
        lgb_learning, lgb_train, lgb_valid,
    ) = prepare_data(df_learning, df_prod)

    param, num_boost_round, early_stopping_rounds = get_lgb_params(
        objective='regression', dataset_nrows=df_learning.shape[0])

    col_predict = 'pred'
    df_learning_pred, df_valid_pred, df_valid_oof, df_prod = train_predict_lgb(
        df_learning, df_valid, X_learning, X_valid, df_valid_oof, df_prod,
        X_valid_oof, X_prod, lgb_train, lgb_valid, lgb_learning, param,
        num_boost_round, early_stopping_rounds, verbose_eval, col_predict)

    # If we fit QuantReg on overfitted (in-sample) predictions, it
    # underestimates the safety margin needed, so we fit on out-of-fold
    # predictions instead.
    df_learning_weighted = pd.concat([df_valid_oof, df_valid_pred]).sample(
        100000, weights='weight', replace=True, random_state=1)
    # df_learning_weighted = df_learning.sample(100000, weights='weight', replace=True, random_state=1)

    to_keep = ['pred', 'horizon']
    X_learning_weighted = df_learning_weighted[to_keep]
    X_learning = df_learning[to_keep]
    X_valid_oof = df_valid_oof[to_keep]
    X_prod = df_prod[to_keep]
    # y_learning = df_learning['sales']
    y_learning_weighted = df_learning_weighted['sales']

    for quantile in constants.LIST_QUANTILE:
        # QuantReg has no weight parameter, so we manually reweight the
        # dataset by resampling with the observation weights.
        linear_model = QuantReg(y_learning_weighted, X_learning_weighted)
        linear_model = linear_model.fit(q=quantile)
        # print(linear_model.summary())
        df_learning['quantile_{:.3f}'.format(quantile)] = linear_model.predict(
            X_learning)
        df_valid_oof['quantile_{:.3f}'.format(
            quantile)] = linear_model.predict(X_valid_oof)
        df_prod['quantile_{:.3f}'.format(quantile)] = linear_model.predict(
            X_prod)

    df_valid_oof = prep.compute_pinball(df_valid_oof)
    return df_valid_oof, df_prod
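# Why the out-of-fold predictions matter (sketch, synthetic data): residuals
# of an overfit model are narrower in-sample than out-of-sample, so quantile
# bands fit on in-sample predictions come out too tight.
import numpy as np

rng = np.random.default_rng(2)
x = rng.uniform(-1, 1, 120)
y = x + rng.normal(scale=0.5, size=120)
coeffs = np.polyfit(x[:60], y[:60], deg=12)  # deliberately overfit
resid_train = y[:60] - np.polyval(coeffs, x[:60])
resid_holdout = y[60:] - np.polyval(coeffs, x[60:])
print(resid_train.std(), resid_holdout.std())  # holdout spread is larger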
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from statsmodels.regression.quantile_regression import QuantReg

train_y = Dataset.load_part('train', 'loss')
train_x = pd.read_csv('preds/%s-train.csv' % pred_name)['loss'].values

orig_maes = []
corr_maes = []

for fold, (fold_train_idx, fold_eval_idx) in enumerate(
        KFold(n_splits=n_folds, shuffle=True, random_state=2016).split(train_x)):
    fold_train_x = train_x[fold_train_idx]
    fold_train_y = train_y[fold_train_idx]
    fold_eval_x = train_x[fold_eval_idx]
    fold_eval_y = train_y[fold_eval_idx]

    # Median regression of the true loss on the raw predictions gives an
    # MAE-oriented calibration of the predictions.
    model = QuantReg(fold_train_y, fold_train_x).fit(q=0.5)
    fold_eval_p = model.predict(fold_eval_x)

    orig_maes.append(mean_absolute_error(fold_eval_y, fold_eval_x))
    corr_maes.append(mean_absolute_error(fold_eval_y, fold_eval_p))

    print("Fold %d, orig MAE = %.5f, corr MAE = %.5f" % (fold, orig_maes[-1], corr_maes[-1]))

print()
print("Avg orig MAE = %.5f" % np.mean(orig_maes))
print("Avg corr MAE = %.5f" % np.mean(corr_maes))
print("Done.")
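# Why q=0.5 here (sketch with synthetic data, illustrative names): among
# linear corrections of a biased prediction, median regression minimizes the
# sum of absolute residuals, i.e. it targets MAE directly.
import numpy as np
from statsmodels.regression.quantile_regression import QuantReg

rng = np.random.default_rng(3)
pred = rng.uniform(1, 10, 1000)                                 # stand-in raw predictions
truth = 1.2 * pred * np.exp(rng.normal(scale=0.4, size=1000))   # skewed target
med = QuantReg(truth, pred[:, None]).fit(q=0.5).predict(pred[:, None])
print(np.abs(truth - pred).mean(), np.abs(truth - med).mean())  # corrected MAE is lower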
class ForecastModelQR(ForecastModelBase):
    """
    QR (quantile regression) forecast model.
    """

    def constructModel(self):
        """
        QR is a special case: there is no separate model-construction step
        (construction and training happen together), so everything is
        implemented in fit().
        :return:
        """
        pass

    def fit(self):
        optimizedHyperParameters = self.optimizedHyperParameters
        fixedHyperParameters = self.fixedHyperParameters
        kernelName = optimizedHyperParameters["kernelName"]
        trainX, trainY, validationX, validationY = self.dataset.getDataset(2)
        self.model = QuantReg(trainY, trainX)

    def predict(self, validationX=None, isFlatten=False):
        if validationX is None:
            validationX = self.dataset.validationX
        optimizedHyperParameters = self.optimizedHyperParameters
        kernelName = optimizedHyperParameters["kernelName"]
        results = self.model.fit(q=0.5, kernel=kernelName)
        predictions = self.model.predict(params=results.params, exog=validationX)
        if isFlatten:
            predictions = predictions.flatten()
        self.dataset.validationD = predictions
        return predictions

    def getOptimizedHyperParametersRange(self):
        optimizedHyperParametersRange = {
            "kernelName": hp.choice("kernelName", ['epa', 'cos', 'gau', 'par']),
        }
        return optimizedHyperParametersRange

    def getDefaultOptimizedHyperParameters(self):
        optimizedHyperParameters = dict()
        # Kernel name
        optimizedHyperParameters["kernelName"] = "epa"
        return optimizedHyperParameters

    def getDefaultFixedHyperParameters(self):
        fixedHyperParameters = dict()
        return fixedHyperParameters

    def getProbabilisticResults(self, probabilisticForecastModelParams, validationX=None):
        if validationX is None:
            validationX = self.dataset.validationX
        validSampleNum = validationX.shape[0]
        optimizedHyperParameters = self.optimizedHyperParameters
        kernelName = optimizedHyperParameters["kernelName"]
        # Exactly 1001 points from 0 to 1 with step 0.001
        F = np.arange(0, 1.001, 0.001)
        predictions = np.zeros(shape=(validSampleNum, len(F)))
        for i in range(len(F)):
            q = F[i]
            if 0 < q < 1:
                results = self.model.fit(q=q, kernel=kernelName)
                prediction = self.model.predict(params=results.params, exog=validationX)
                predictions[:, i] = prediction.T
        # Extrapolate the q=0 and q=1 endpoints linearly, then enforce monotonicity
        predictions[:, 0] = 2 * predictions[:, 1] - predictions[:, 2]
        predictions[:, -1] = 2 * predictions[:, -2] - predictions[:, -3]
        predictions.sort(axis=1)
        pdfs = []
        cdfs = []
        for i in range(validSampleNum):
            # Exactly 1001 points from 0 to 1 with step 0.001
            x = predictions[i, :]
            x = self.dataset.reverseLabel(x)
            c = dict()
            c["x"] = x
            c["F"] = F
            cdfs.append(c)
            # Going from a known PDF to the CDF is a well-defined integration.
            # Going the other way, with the functional form of the PDF unknown,
            # the result depends on the assumptions behind the chosen method:
            # with dense points the area definition can approximate each slice
            # as a small rectangle or trapezoid, and the choice of assumption
            # can change the resulting PDF considerably. Alternatively, one can
            # sample from the CDF and apply kernel density estimation; it all
            # comes down to the assumptions made.
            # Method 1 (recommended here): area definition with small rectangles.
            xNew = np.linspace(x.min(), x.max(), len(x))
            y = MathInterpolateUtils.interp1d(x, F, xNew, kind="slinear")
            f = np.zeros(shape=x.shape)
            for j in range(1, len(f)):
                f[j] = (y[j] - y[j - 1]) / (xNew[j] - xNew[j - 1])
            x = xNew
            # Method 2: area definition with small trapezoids.
            # f = np.zeros(shape=x.shape)
            # for j in range(1, len(F)):
            #     f[j] = 2 * (F[j] - F[j - 1]) / (x[j] - x[j - 1]) - f[j - 1]
            # Method 3: kernel density estimation.
            # It needs uniform random draws from the CDF; since the quantile
            # levels are already uniformly spaced, the corresponding x values
            # can be used directly. Method 3 is slow, and the pipeline mostly
            # uses the CDF rather than the PDF, so it is not recommended here;
            # it is only used by the dedicated PDF-display service.
            # paramGrid = {'bandwidth': np.arange(0, 5, 0.5)}
            # kde = KernelDensity(kernel='epanechnikov')
            # kdeGrid = GridSearchCV(estimator=kde, param_grid=paramGrid, cv=3)
            # kde = kdeGrid.fit(x.reshape(-1, 1)).best_estimator_
            # logDens = kde.score_samples(x.reshape(-1, 1))
            # f = np.exp(logDens)
            p = dict()
            p["x"] = x
            p["f"] = f
            pdfs.append(p)
        probabilisticResults = {
            "pdfs": np.array(pdfs),
            "cdfs": np.array(cdfs)
        }
        self.dataset.validationP = probabilisticResults
        return probabilisticResults
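# An alternative to the manual finite differences in method 1 above (sketch):
# np.gradient gives a centered-difference estimate of f = dF/dx on the grid.
# The normal CDF here is purely illustrative.
import numpy as np
from scipy.stats import norm

x_grid = np.linspace(-4, 4, 1001)   # quantile values
F_grid = norm.cdf(x_grid)           # their cumulative probabilities
pdf = np.gradient(F_grid, x_grid)   # closely recovers norm.pdf(x_grid)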