import os

from pyspark.sql.functions import monotonically_increasing_id

# config, Hdfs2Df, PreProcessor, SavaTools, XGBoostClassifier and Evaluator are
# project modules assumed to be imported elsewhere in this file.


def train(spark):
    # Optionally clear the XGBoost checkpoint directory on HDFS before training.
    # os.system returns the shell exit status, so 0 means the delete succeeded.
    if config['XGBOOST']['checkpointInitialization'] == 'true':
        checkpoint_path = config['XGBOOST']['checkpoint_path']
        op = os.system("hadoop fs -rmr %s/*" % checkpoint_path)
        if not op:
            print("initialize checkpoint successfully.")

    # Load the train and test sets from HDFS as Spark DataFrames.
    train_df = Hdfs2Df.readHdfsCsv(spark=spark, data_path=config['TRAIN']['train_path'])
    test_df = Hdfs2Df.readHdfsCsv(spark=spark, data_path=config['TRAIN']['test_path'])

    # Cast column types and apply the configured missing-value marker.
    missing = config['XGBOOST']['missing']
    train_df = PreProcessor.transColType(train_df, missing)
    test_df = PreProcessor.transColType(test_df, missing)

    # Assemble the raw columns into a single 'features' vector column.
    train, train_col = PreProcessor.transVector(train_df, 'features')
    test, test_col = PreProcessor.transVector(test_df, 'features')  # was train_df: the test set must be vectorized, not the train set again
    SavaTools.saveModelFeature(train_col, config['TRAIN']['local_model_feature_path'])

    # Train the model, persist it to HDFS, then score both sets.
    xgb_handle = XGBoostClassifier(config['XGBOOST'])
    xgbModel = xgb_handle.trainAndSave(spark, train, config['TRAIN']['hdfs_model_path'])
    train_res, train_auc = xgb_handle.predict(spark, train, xgbModel)
    test_res, test_auc = xgb_handle.predict(spark, test, xgbModel)
    train_res.cache()
    test_res.cache()

    # Evaluate KS and AUC on the scored train and test sets.
    evaluator_handle = Evaluator(spark)
    train_ks = evaluator_handle.evaluateKs(train_res, 'train_res', 'score')
    train_auc = evaluator_handle.evaluateAuc(train_res, "score")
    test_ks = evaluator_handle.evaluateKs(test_res, 'test_ks', 'score')
    test_auc = evaluator_handle.evaluateAuc(test_res, "score")

    # Save feature importances together with the evaluation metrics.
    fscore = xgbModel.booster.getFeatureScore()
    xgb_handle.saveFeatureImportance(
        train_col, fscore, config['TRAIN']['local_model_feature_weights_path'],
        train_auc, test_auc, train_ks, test_ks)

    # Persist the scored DataFrames back to HDFS.
    SavaTools.saveHdfsFile(train_res, config['TRAIN']['train_res_path'])
    SavaTools.saveHdfsFile(test_res, config['TRAIN']['test_res_path'])  # was train_res: the test scores belong in the test output path
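# --- Illustrative sketch, not part of the original pipeline ---
# PreProcessor.transVector is assumed to wrap Spark ML's VectorAssembler and to
# return both the transformed DataFrame and the list of feature column names
# (which is why its result is unpacked as a tuple above). A minimal standalone
# version, under that assumption, could look like this; the function name and
# the excluded identity/label columns are hypothetical.
from pyspark.ml.feature import VectorAssembler


def trans_vector_sketch(df, output_col='features',
                        exclude=('label', 'name', 'idcard', 'phone')):
    # Treat every column not listed in `exclude` as a numeric feature.
    feature_cols = [c for c in df.columns if c not in exclude]
    assembler = VectorAssembler(inputCols=feature_cols, outputCol=output_col)
    return assembler.transform(df), feature_cols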
# Method of XGBoostClassifier (the class definition is omitted in this excerpt).
def predict(self, spark, tmp, xgb):
    # transVector returns (DataFrame, feature columns); only the DataFrame is needed here.
    data, _ = PreProcessor.transVector(tmp, 'features')

    # Score with the trained model; keep the positive-class probability and the label.
    predictions = xgb.predict(data, -999) \
        .map(lambda row: (row['predictions'][1], row['label']))
    predictions = predictions.toDF("score", "label")

    # Re-attach the identity columns by joining on a monotonically increasing index.
    right = predictions.withColumn("idx", monotonically_increasing_id())
    left = tmp.select(['name', 'idcard', 'phone']).withColumn("idx", monotonically_increasing_id())
    res_df = left.join(right, ['idx'], 'inner').drop('idx')

    # Pass the score column explicitly, matching the evaluateAuc usage in train().
    evaluator_handle = Evaluator(spark)
    auc = evaluator_handle.evaluateAuc(res_df, "score")
    print("AUC: ", auc)
    return res_df, auc
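# --- Illustrative entry point, a sketch rather than code from the original source ---
# One way train() might be wired up: `config` is assumed to be loaded elsewhere
# in the module (e.g. via configparser), and the application name is hypothetical.
if __name__ == '__main__':
    from pyspark.sql import SparkSession

    spark = SparkSession.builder \
        .appName("xgboost-train") \
        .getOrCreate()
    train(spark)
    spark.stop()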