Example #1
import os

# `config` is a module-level settings mapping; a sketch of how it might be
# loaded follows after this example.
def train(spark):
    # Optionally clear old checkpoints before training starts.
    if config['XGBOOST']['checkpointInitialization'] == 'true':
        checkpoint_path = config['XGBOOST']['checkpoint_path']
        # os.system returns the shell's exit status; 0 means success.
        op = os.system("hadoop fs -rm -r %s/*" % checkpoint_path)
        if not op:
            print("checkpoint initialized successfully.")

    # Load the train and test CSVs from HDFS.
    train_df = Hdfs2Df.readHdfsCsv(spark=spark,
                                   data_path=config['TRAIN']['train_path'])
    test_df = Hdfs2Df.readHdfsCsv(spark=spark,
                                  data_path=config['TRAIN']['test_path'])

    missing = config['XGBOOST']['missing']
    # Normalize column types (using the configured missing-value marker),
    # then assemble the feature columns into a single vector column.
    train_df = PreProcessor.transColType(train_df, missing)
    test_df = PreProcessor.transColType(test_df, missing)
    train, train_col = PreProcessor.transVector(train_df, 'features')
    test, test_col = PreProcessor.transVector(test_df, 'features')

    # Persist the feature list locally, then fit the model and save it to HDFS.
    SavaTools.saveModelFeature(train_col,
                               config['TRAIN']['local_model_feature_path'])
    xgb_handle = XGBoostClassifier(config['XGBOOST'])
    xgbModel = xgb_handle.trainAndSave(spark, train,
                                       config['TRAIN']['hdfs_model_path'])

    # The Evaluator below recomputes AUC, so the values returned by
    # predict are discarded here.
    train_res, _ = xgb_handle.predict(spark, train, xgbModel)
    test_res, _ = xgb_handle.predict(spark, test, xgbModel)
    train_res.cache()
    test_res.cache()

    evaluator_handle = Evaluator(spark)
    train_ks = evaluator_handle.evaluateKs(train_res, 'train_ks', 'score')
    train_auc = evaluator_handle.evaluateAuc(train_res, 'score')
    test_ks = evaluator_handle.evaluateKs(test_res, 'test_ks', 'score')
    test_auc = evaluator_handle.evaluateAuc(test_res, 'score')

    # Export per-feature importance alongside the headline metrics.
    fscore = xgbModel.booster.getFeatureScore()
    xgb_handle.saveFeatureImportance(
        train_col, fscore, config['TRAIN']['local_model_feature_weights_path'],
        train_auc, test_auc, train_ks, test_ks)
    SavaTools.saveHdfsFile(train_res, config['TRAIN']['train_res_path'])
    SavaTools.saveHdfsFile(test_res, config['TRAIN']['test_res_path'])
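
Both examples read from a module-level config mapping that the snippets never define. A minimal sketch of how it might be loaded, assuming an INI file parsed with configparser (the loader, file name, and all values below are assumptions; only the section and key names come from the code above). configparser returns every value as a string, which matches the == 'true' comparison in train():

import configparser

# Hypothetical loader for the `config` mapping used by train().
config = configparser.ConfigParser()
config.read('train.ini')

# train.ini would then contain entries such as (illustrative values only):
#
# [XGBOOST]
# checkpointInitialization = true
# checkpoint_path = /tmp/xgb_checkpoints
# missing = -999
#
# [TRAIN]
# train_path = /data/train.csv
# test_path = /data/test.csv
# local_model_feature_path = ./model_features.txt
# hdfs_model_path = /models/xgb
# local_model_feature_weights_path = ./feature_weights.txt
# train_res_path = /results/train_res
# test_res_path = /results/test_res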
Example #2
from pyspark.sql.functions import monotonically_increasing_id

def predict(self, spark, tmp, xgb):
    # transVector returns (vectorized_df, feature_columns); only the
    # DataFrame is needed here.
    data, _ = PreProcessor.transVector(tmp, 'features')
    # Keep the positive-class probability and the label for each row.
    predictions = xgb.predict(
        data, -999).map(lambda row: (row['predictions'][1], row['label']))
    predictions = predictions.toDF("score", "label")
    # Attach matching positional ids so the scores can be joined back to
    # the identifying columns. monotonically_increasing_id() only lines up
    # on both sides while the two DataFrames share the same partitioning.
    right = predictions.withColumn("idx", monotonically_increasing_id())
    left = (tmp.select(['name', 'idcard', 'phone'])
            .withColumn("idx", monotonically_increasing_id()))
    res_df = left.join(right, ['idx'], 'inner').drop('idx')
    evaluator_handle = Evaluator(spark)
    auc = evaluator_handle.evaluateAuc(res_df, "score")
    print("AUC:", auc)
    return res_df, auc
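
The positional join above is fragile: monotonically_increasing_id() only yields matching ids on both sides while the two DataFrames keep identical partitioning. A safer sketch for pairing rows by position, using RDD.zipWithIndex (with_row_idx is a hypothetical helper, not part of the original code):

def with_row_idx(df, idx_col="idx"):
    # zipWithIndex numbers rows 0..n-1 by their order within the RDD, so
    # two DataFrames indexed this way join positionally even when their
    # partitioning differs, unlike monotonically_increasing_id().
    rdd = df.rdd.zipWithIndex().map(lambda pair: tuple(pair[0]) + (pair[1],))
    return rdd.toDF(df.columns + [idx_col])

# Drop-in replacement for the left/right construction in predict():
left = with_row_idx(tmp.select(['name', 'idcard', 'phone']))
right = with_row_idx(predictions)
res_df = left.join(right, ['idx'], 'inner').drop('idx')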