def gridSearchBidPrice(self, y_prob, slotprices):
    """Grid-search the linear bid strategy and return its best parameters.

    Runs ``BidEstimator.gridSearch_bidPrice`` with the ``'linearBidPrice'``
    model against ``self.gold_val`` and writes the resulting performance
    table to files derived from ``self.bids_tuning_perf_filepath``.

    Args:
        y_prob: Click predictions, passed through to the grid search.
        slotprices: Slot prices, passed through to the grid search.

    Returns:
        Tuple ``(best_pred_thresh, best_base_bid)`` reported by the search.
    """
    print("=== Get best bid prices")
    # Fixed CTR taken from the full train set instead of recomputing it
    # with ClickEvaluator().compute_avgCTR(self.Y_train).
    avg_ctr = 0.00075
    print("Train avgCTR = {}".format(avg_ctr))

    bid_estimator = BidEstimator()
    # TODO: could add option for alternate bid strats
    best_pred_thresh, best_base_bid, perf_df = bid_estimator.gridSearch_bidPrice(
        y_prob, avg_ctr, slotprices, self.gold_val,
        bidpriceest_model='linearBidPrice')

    # The pattern is escaped and anchored so that only the trailing ".csv"
    # extension is rewritten; a bare '.csv' pattern also matches things like
    # a "/csv" directory component because "." matches any character.
    ipinyouWriter.ResultWriter().writeResult(
        re.sub(r'\.csv$', '-linearBidPrice.csv', self.bids_tuning_perf_filepath),
        perf_df)

    # NOTE(review): the 'linearBidPrice_variation' grid search is disabled, so
    # this file currently receives the same linearBidPrice table as above --
    # confirm whether the search should be re-enabled or this write dropped.
    ipinyouWriter.ResultWriter().writeResult(
        re.sub(r'\.csv$', '-linearBidPrice_variation.csv', self.bids_tuning_perf_filepath),
        perf_df)

    return best_pred_thresh, best_base_bid
def exeEnsemble_v1(trainDF, targetDF, trainPath, validationPath, targetPath, writeResult2CSV=False):
    """Gate the CNN click predictions with XGBoost and bid on the result.

    For every target row the XGBoost prediction acts as a filter: below 0.75
    the ensemble prediction is forced to 0, otherwise the CNN prediction is
    used. Bid prices come from ``BidEstimator.linearBidPrice_mConfi``.

    Args:
        trainDF: Training frame handed to the XGBoost model.
        targetDF: Frame to predict and bid on; must provide ``'bidid'``.
        trainPath: Training data path handed to the CNN model.
        validationPath: Validation data path handed to the CNN model.
        targetPath: Path handed to the CNN model as its test data.
        writeResult2CSV: When true, write the bids to
            ``resultEnsemble_v1.csv``.
    """
    # exeXGBoostBidModel returns a (validation, test) pair of predictions.
    # targetDF is supplied for both roles so the test side is not fed None;
    # only the validation half is used here.
    xg_y_pred, _ = exeXGBoostBidModel(validationData=targetDF, trainData=trainDF,
                                      testData=targetDF, writeResult2CSV=False)

    # NOTE(review): assumes exeCNNBidModel also returns a (validation, test)
    # pair and that the test half corresponds to targetPath -- confirm.
    # trainPath replaces the previously undefined name `trainset`.
    _, cnn_y_pred = exeCNNBidModel(validationDataPath=validationPath, trainDataPath=trainPath,
                                   testDataPath=targetPath, writeResult2CSV=False)

    # Use XG's 0 when its prediction is below 0.75, otherwise keep CNN's value.
    y_pred = [0 if xg < 0.75 else cnn for xg, cnn in zip(xg_y_pred, cnn_y_pred)]

    prune_thresh = 0.2
    be = BidEstimator()
    bidprice = be.linearBidPrice_mConfi(y_pred, 230, 100, prune_thresh)

    bids = np.stack([targetDF['bidid'], bidprice], axis=1)
    bids = pd.DataFrame(bids, columns=['bidid', 'bidprice'])

    if writeResult2CSV:
        ipinyouWriter.ResultWriter().writeResult("resultEnsemble_v1.csv", bids)

    myEvaluator = Evaluator.Evaluator()
    myEvaluator.computePerformanceMetricsDF(6250*1000, bids, targetDF)

    # Binarise the ensemble prediction at prune_thresh for the click scores.
    y_pred = [1 if i >= prune_thresh else 0 for i in y_pred]
    ce = Evaluator.ClickEvaluator()
    ce.printClickPredictionScore(y_pred, targetDF)
def exeLogisticRegressionBidModel_v2(validationReader=None, trainReader=None, writeResult2CSV=False):
    """Train ``LinearBidModel_v2`` on one-hot data and evaluate its bids.

    Args:
        validationReader: Reader providing ``getOneHotData(train_cols=...)``
            and ``getDataFrame()`` for the validation set.
        trainReader: Reader providing ``getOneHotData()`` for the train set.
        writeResult2CSV: When true, write the bids to
            ``resultLogisticRegressionBidModel.csv``.

    Returns:
        The click predictions returned by ``lbm.getBidPrice``.
    """
    print("============ LogisticRegressionBidModel_v2")
    trainOneHotData, trainY = trainReader.getOneHotData()
    # Index.tolist() gives the same column list as get_values().tolist(),
    # and still exists on pandas versions where get_values() was removed.
    validationOneHotData, valY = validationReader.getOneHotData(
        train_cols=trainOneHotData.columns.tolist())

    X_train = trainOneHotData
    Y_train = trainY['click']
    X_val = validationOneHotData
    Y_val = valY['click']

    lbm = LinearBidModel_v2(cBudget=110, avgCTR=0.2)
    lbm.trainModel(X_train, Y_train)

    v_df = validationReader.getDataFrame()
    y_pred, bids = lbm.getBidPrice(X_val, v_df)

    if writeResult2CSV:
        ipinyouWriter.ResultWriter().writeResult("resultLogisticRegressionBidModel.csv", bids)

    myEvaluator = Evaluator()
    myEvaluator.computePerformanceMetricsDF(6250 * 1000, bids, v_df)
    myEvaluator.printResult()

    return y_pred
def exeGaussianRandomBidModel(validationData, trainData=None, writeResult2CSV=False):
    """Bid with ``BidModels.GaussianRandomBidModel`` and print the evaluation.

    Args:
        validationData: Data to bid on; its ``bidid`` attribute is passed to
            the model and the whole object is passed to the evaluator.
        trainData: Unused.
        writeResult2CSV: When true, write the bids to
            ``resultGaussianRandomBidModel.csv``.
    """
    model = BidModels.GaussianRandomBidModel()
    bid_frame = model.getBidPrice(validationData.bidid)

    if writeResult2CSV:
        writer = ipinyouWriter.ResultWriter()
        writer.writeResult("resultGaussianRandomBidModel.csv", bid_frame)

    evaluator = Evaluator()
    evaluator.computePerformanceMetricsDF(6250 * 1000, bid_frame, validationData)
    evaluator.printResult()
def exeUniformRandomBidModel(validationData, trainData=None, writeResult2CSV=False):
    """Bid with ``BidModels.UniformRandomBidModel`` and print the evaluation.

    Args:
        validationData: Data to bid on; its ``bidid`` attribute is passed to
            the model and the whole object is passed to the evaluator.
        trainData: Unused.
        writeResult2CSV: When true, write the bids to
            ``resultUniformRandomBidModel.csv``.
    """
    # 300 is the upper bound of the random bidding range.
    # TODO: could train this too in a range.
    model = BidModels.UniformRandomBidModel(300)
    bid_frame = model.getBidPrice(validationData.bidid)

    if writeResult2CSV:
        writer = ipinyouWriter.ResultWriter()
        writer.writeResult("resultUniformRandomBidModel.csv", bid_frame)

    evaluator = Evaluator()
    evaluator.computePerformanceMetricsDF(6250 * 1000, bid_frame, validationData)
    evaluator.printResult()
def exeConstantBidModel(validationData, trainData=None, train=False, writeResult2CSV=False):
    """Bid with ``BidModels.ConstantBidModel`` and print the evaluation.

    Args:
        validationData: Data to bid on; its ``bidid`` attribute is passed to
            the model and the whole object is passed to the evaluator.
        trainData: Passed to ``trainModel`` when ``train`` is true.
        train: When true, train the model over the search range [1, 300]
            before bidding; otherwise the default bid of 77 is kept.
        writeResult2CSV: When true, write the bids to
            ``resultConstantBidModel.csv``.
    """
    model = BidModels.ConstantBidModel(defaultbid=77)
    if train:
        model.trainModel(trainData, searchRange=[1, 300], budget=int(6250*1000*8.88))

    bid_frame = model.getBidPrice(validationData.bidid)

    if writeResult2CSV:
        writer = ipinyouWriter.ResultWriter()
        writer.writeResult("resultConstantBidModel.csv", bid_frame)

    evaluator = Evaluator()
    evaluator.computePerformanceMetricsDF(6250 * 1000, bid_frame, validationData)
    evaluator.printResult()
def exeXGBoostBidModel(validationData, trainData=None, testData=None, writeResult2CSV=False, testMode=True):
    """Train ``XGBoostBidModel`` and return its validation/test predictions.

    Feature columns are every column of ``trainData`` except the label,
    identifier and price columns listed in ``unwanted_Column``. Columns from
    that list that are absent from ``trainData`` are simply ignored.

    Args:
        validationData: Data to bid on and predict for.
        trainData: Training data; its columns define the feature set.
        testData: Second data set to predict for.
        writeResult2CSV: When true, write the validation bids to
            ``resultXGBoostBidModel.csv``.
        testMode: When false, also evaluate the validation bids and print
            the result.

    Returns:
        Tuple ``(validation predictions, test predictions)`` from
        ``xgd.getY_Pred``.
    """
    Y_column = 'click'
    unwanted_Column = {'click', 'bidid', 'bidprice', 'payprice', 'userid', 'IP', 'url', 'creative', 'keypage'}
    # Columns come from the trainData parameter; the name `trainDF` used here
    # before is not defined in this function.
    X_column = [col for col in list(trainData) if col not in unwanted_Column]

    xgd = XGBoostBidModel(X_column, Y_column)
    xgd.trainModel(trainData)
    bids = xgd.getBidPrice(validationData)

    if writeResult2CSV:
        ipinyouWriter.ResultWriter().writeResult("resultXGBoostBidModel.csv", bids)

    if not testMode:
        myEvaluator = Evaluator()
        myEvaluator.computePerformanceMetricsDF(6250 * 1000, bids, validationData)
        myEvaluator.printResult()

    # NOTE(review): testData defaults to None and is passed straight to
    # getY_Pred -- confirm that getY_Pred copes with None.
    return xgd.getY_Pred(validationData), xgd.getY_Pred(testData)
def exeLogisticRegressionBidModel(validationData=None, trainData=None, writeResult2CSV=False):
    """Train the formula-based logistic regression bid model and evaluate it.

    The regression formula's right-hand side is every column of ``trainData``
    except those in ``unwanted_Column``, joined with ``' + '``. Columns from
    that list that are absent from ``trainData`` are simply ignored.

    Args:
        validationData: Data to bid on and evaluate against.
        trainData: Training data; its columns define the formula terms.
        writeResult2CSV: When true, write the bids to
            ``LRbidModelresult.csv``.
    """
    unwanted_Column = {'click', 'bidid', 'bidprice', 'payprice', 'userid', 'IP', 'url', 'creative', 'keypage'}
    X_column = [col for col in list(trainData) if col not in unwanted_Column]
    final_x = ' + '.join(X_column)

    lrBidModel = LinearBidModel.LinearBidModel(regressionFormulaY='click', regressionFormulaX=final_x,
                                               cBudget=272.412385 * 1000, avgCTR=0.2,
                                               modelType='logisticregression')
    print(type(validationData))
    lrBidModel.trainModel(trainData, retrain=True, modelFile="LogisticRegression.pkl")

    bids = lrBidModel.getBidPrice(validationData)
    if writeResult2CSV:
        ipinyouWriter.ResultWriter().writeResult("LRbidModelresult.csv", bids)

    myEvaluator = Evaluator.Evaluator()
    myEvaluator.computePerformanceMetricsDF(6250*1000, bids, validationData)
    myEvaluator.printResult()
def exeEnsemble_Weighted(trainDF, validateDF, testDF, trainPath, validationPath, testPath, trainReader, validateReader, testReader, writeResult2CSV=False):
    """Blend XGBoost and CNN click predictions and produce test-set bids.

    The ensemble prediction is ``0.7 * XGBoost + 0.3 * CNN``. ROC curves and
    prediction histograms for the validation set, the bid grid-search
    results, and the final test bids are all written under
    ``./SavedEnsembleInfo/`` with a shared timestamp in the file names.

    Args:
        trainDF: Training frame for the XGBoost model.
        validateDF: Validation frame; provides ``'click'`` and
            ``'slotprice'``.
        testDF: Test frame; provides ``'bidid'``.
        trainPath: Training data path for the CNN model.
        validationPath: Validation data path for the CNN model.
        testPath: Test data path for the CNN model.
        trainReader: Unused (only needed by the disabled LR/FM models).
        validateReader: Unused (only needed by the disabled LR/FM models).
        testReader: Unused (only needed by the disabled LR/FM models).
        writeResult2CSV: Unused; the result files are always written.
    """
    # Validation AUC recorded for the weightings tried so far:
    #   xg 0.4 / cnn 0.4 / lr 0.05 / fm 0.15 -> 87.80
    #   xg 0.6 / lr 0.1 / fm 0.3 (no CNN)    -> 0.874 (previously 0.861)
    #   xg 0.5 / cnn 0.5 / lr 0.05 / fm 0.15 -> 0.8760
    #   xg 0.6 / cnn 0.4                     -> 0.8810
    #   xg 0.5 / cnn 0.5                     -> 0.8797
    #   xg 0.7 / cnn 0.3                     -> 0.8840  (current choice)
    #   xg 0.8 / cnn 0.2                     -> 0.8836
    xg_weight = 0.7
    cnn_weight = 0.3

    xg_val_y_pred, xg_test_y_pred = exeXGBoostBidModel(validationData=validateDF, trainData=trainDF,
                                                       testData=testDF, writeResult2CSV=False)
    cnn_val_y_pred, cnn_test_y_pred = exeCNNBidModel(validationDataPath=validationPath, trainDataPath=trainPath,
                                                     testDataPath=testPath, writeResult2CSV=False)

    val_y_pred = [(xg * xg_weight + cnn * cnn_weight) for xg, cnn in zip(xg_val_y_pred, cnn_val_y_pred)]

    timestamp = str(time.strftime("%Y%m%d-%H%M%S"))

    print("XGBoost AUC:")
    ClickEvaluator().clickROC(validateDF['click'], xg_val_y_pred,
                              imgpath="./SavedEnsembleInfo/XGBoost_AUC-" + timestamp + ".jpg")
    print("CNN AUC:")
    ClickEvaluator().clickROC(validateDF['click'], cnn_val_y_pred,
                              imgpath="./SavedEnsembleInfo/CNN_AUC-" + timestamp + ".jpg")
    print("Ensemble AUC:")
    ClickEvaluator().clickROC(validateDF['click'], val_y_pred,
                              imgpath="./SavedEnsembleInfo/ensemble_weighted_AUC-" + timestamp + ".jpg",
                              showGraph=False)

    # Histograms of the ensemble prediction, split by the true click label.
    val_y_pred = np.array(val_y_pred)
    click1 = val_y_pred[validateDF.click == 1]
    ClickEvaluator().clickProbHistogram(pred_prob=click1, color='g',
                                        title='Predicted probabilities for clicks=1',
                                        imgpath="./SavedEnsembleInfo/ensemble_weighted-click1-" + timestamp + ".jpg",
                                        showGraph=False)
    click0 = val_y_pred[validateDF.click == 0]
    ClickEvaluator().clickProbHistogram(pred_prob=click0, color='r',
                                        title='Predicted probabilities for clicks=0',
                                        imgpath="./SavedEnsembleInfo/ensemble_weighted-click0-" + timestamp + ".jpg",
                                        showGraph=False)

    ### Bid price model evaluations
    test_y_pred = [(xg * xg_weight + cnn * cnn_weight) for xg, cnn in zip(xg_test_y_pred, cnn_test_y_pred)]

    # Series.values replaces Series.as_matrix(), which newer pandas removed.
    slotprices_val = validateDF['slotprice'].values.astype(int)

    print("=== Get best bid prices on validation set")
    # TODO: fixed override standing in for the complete train set's avg CTR.
    avg_ctr = 0.00075
    print("Train avgCTR = {}".format(avg_ctr))

    bid_estimator = BidEstimator()
    print("== linearBidPrice")
    _best_pred_thresh, best_base_bid, perf_df = bid_estimator.gridSearch_bidPrice(
        val_y_pred, avg_ctr, slotprices_val, validateDF, bidpriceest_model='linearBidPrice')
    ipinyouWriter.ResultWriter().writeResult(
        "./SavedEnsembleInfo/ensemble_weighted-linearBidPrice-" + timestamp + ".csv", perf_df)

    # Bid on the test set with the base bid found on the validation set.
    bids = bid_estimator.linearBidPrice(test_y_pred, best_base_bid, avg_ctr)

    # Pair each bid with its bidid, aligned on testDF's index.
    bids_df = pd.concat(
        [testDF['bidid'],
         pd.DataFrame(bids, columns=['bidprice'], index=testDF['bidid'].index)],
        axis=1)
    ipinyouWriter.ResultWriter().writeResult(
        "./SavedEnsembleInfo/ensemble_weighted-testbids-" + timestamp + ".csv", bids_df)