def testWithEvalSet():
    """Fit a CatBoostRegressor with one evaluation pool and show its predictions.

    A six-row training pool and a three-row test pool are created (both carry
    group id, subgroup id and weight columns). A 20-iteration regressor is
    fitted on the training pool with the test pool passed as the only eval
    set, and the transformed test data is printed. Nothing is asserted about
    the predicted values.
    """
    spark = test_helpers.getOrCreateSparkSession(test_helpers.getCurrentMethodName())
    import catboost_spark

    featureNames = ["f1", "f2", "f3"]

    srcSchemaData = [
        ("features", VectorUDT()),
        ("label", StringType()),
        ("groupId", LongType()),
        ("subgroupId", IntegerType()),
        ("weight", FloatType()),
    ]

    # (features, label, groupId, subgroupId, weight)
    trainSamples = [
        ((0.13, 0.22, 0.23), "0.34", 0x86F1B93B695F9E6, 0x23D794E, 1.0),
        ((0.1, 0.2, 0.11), "0.12", 0xB337C6FEFE2E2F7, 0xD34BFBD, 0.12),
        ((0.97, 0.82, 0.33), "0.22", 0xB337C6FEFE2E2F7, 0x19CE5B0, 0.18),
        ((0.9, 0.67, 0.17), "0.01", 0xD9DBDD3199D6518, 0x19CE5B0, 1.0),
        ((0.66, 0.1, 0.31), "0.0", 0xD9DBDD3199D6518, 0x1FA606F, 2.0),
        ((0.14, 0.18, 0.1), "0.42", 0xD9DBDD3199D6518, 0x62772D1, 0.45),
    ]
    testSamples = [
        ((0.0, 0.33, 1.1), "0.22", 0x4AAFFF456765757, 0xD34BFBD, 0.1),
        ((0.02, 0.0, 0.38), "0.11", 0x686726738873ABC, 0x23D794E, 1.0),
        ((0.86, 0.54, 0.9), "0.48", 0x7652786FF37ABBE, 0x19CE5B0, 0.17),
    ]
    srcTrainData = [
        Row(Vectors.dense(*values), label, groupId, subgroupId, weight)
        for values, label, groupId, subgroupId, weight in trainSamples
    ]
    srcTestData = [
        Row(Vectors.dense(*values), label, groupId, subgroupId, weight)
        for values, label, groupId, subgroupId, weight in testSamples
    ]

    columnRoles = {"groupId": "groupId", "subgroupId": "subgroupId", "weight": "weight"}

    # Each pool gets its own schema object and its own copy of the column mapping.
    # NOTE(review): the helper `getCurrentMethodName` itself (not its result) is
    # passed as the first argument here, unlike the session call above -- confirm
    # that createRawPool accepts this.
    pools = []
    for rows in (srcTrainData, srcTestData):
        schema = pool_test_helpers.createSchema(
            srcSchemaData,
            featureNames,
            addFeatureNamesMetadata=True
        )
        pools.append(
            pool_test_helpers.createRawPool(
                test_helpers.getCurrentMethodName,
                schema,
                rows,
                dict(columnRoles)
            )
        )
    trainPool, testPool = pools

    regressor = catboost_spark.CatBoostRegressor()
    regressor = regressor.setIterations(20)
    trainDir = tempfile.mkdtemp(prefix=test_helpers.getCurrentMethodName())
    regressor = regressor.setTrainDir(trainDir)

    model = regressor.fit(trainPool, [testPool])
    predictions = model.transform(testPool.data)

    print("predictions")
    predictions.show(truncate=False)
def testBinaryClassificationWithClassWeightsMap():
    """Fit a CatBoostClassifier configured with a class-weights map and show predictions.

    Six rows with integer labels 0/1 are placed in a raw pool. The classifier
    runs 20 iterations with class weights {"0": 1.0, "1": 2.0} and Debug
    logging. The transformed training data is printed; nothing is asserted.
    """
    spark = test_helpers.getOrCreateSparkSession(test_helpers.getCurrentMethodName())
    import catboost_spark

    featureNames = ["f1", "f2", "f3"]
    srcSchemaData = [
        ("features", VectorUDT()),
        ("label", IntegerType()),
    ]

    # (features, label)
    samples = [
        ((0.1, 0.2, 0.11), 0),
        ((0.97, 0.82, 0.33), 1),
        ((0.13, 0.22, 0.23), 1),
        ((0.14, 0.18, 0.1), 0),
        ((0.9, 0.67, 0.17), 0),
        ((0.66, 0.1, 0.31), 0),
    ]
    srcData = [Row(Vectors.dense(*values), label) for values, label in samples]

    # NOTE(review): `getCurrentMethodName` is passed uncalled here -- confirm
    # that createRawPool accepts this.
    schema = pool_test_helpers.createSchema(
        srcSchemaData,
        featureNames,
        addFeatureNamesMetadata=True
    )
    pool = pool_test_helpers.createRawPool(
        test_helpers.getCurrentMethodName,
        schema,
        srcData,
        {}
    )

    # Insertion order is kept: class "0" first, then class "1".
    classWeightsMap = collections.OrderedDict()
    classWeightsMap["0"] = 1.0
    classWeightsMap["1"] = 2.0

    classifier = catboost_spark.CatBoostClassifier()
    classifier = classifier.setIterations(20)
    classifier = classifier.setClassWeightsMap(classWeightsMap)
    classifier = classifier.setLoggingLevel(catboost_spark.ELoggingLevel.Debug)
    trainDir = tempfile.mkdtemp(prefix=test_helpers.getCurrentMethodName())
    classifier = classifier.setTrainDir(trainDir)

    model = classifier.fit(pool)
    predictions = model.transform(pool.data)
    predictions.show(truncate=False)
def testBinaryClassificationWithTargetBorder():
    """Fit a CatBoostClassifier on floating-point labels using a target border of 0.5.

    Six rows with DoubleType labels are placed in a raw pool. The classifier
    runs 20 iterations with `setTargetBorder(0.5)`. The transformed training
    data is printed; nothing is asserted.
    """
    spark = test_helpers.getOrCreateSparkSession(test_helpers.getCurrentMethodName())
    import catboost_spark

    featureNames = ["f1", "f2", "f3"]
    srcSchemaData = [
        ("features", VectorUDT()),
        ("label", DoubleType()),
    ]

    # (features, label)
    samples = [
        ((0.1, 0.2, 0.11), 0.12),
        ((0.97, 0.82, 0.33), 0.1),
        ((0.13, 0.22, 0.23), 0.7),
        ((0.14, 0.18, 0.1), 0.33),
        ((0.9, 0.67, 0.17), 0.82),
        ((0.66, 0.1, 0.31), 0.93),
    ]
    srcData = [Row(Vectors.dense(*values), label) for values, label in samples]

    # NOTE(review): `getCurrentMethodName` is passed uncalled here -- confirm
    # that createRawPool accepts this.
    schema = pool_test_helpers.createSchema(
        srcSchemaData,
        featureNames,
        addFeatureNamesMetadata=True
    )
    pool = pool_test_helpers.createRawPool(
        test_helpers.getCurrentMethodName,
        schema,
        srcData,
        {}
    )

    classifier = catboost_spark.CatBoostClassifier()
    classifier = classifier.setIterations(20)
    classifier = classifier.setTargetBorder(0.5)
    trainDir = tempfile.mkdtemp(prefix=test_helpers.getCurrentMethodName())
    classifier = classifier.setTrainDir(trainDir)

    model = classifier.fit(pool)
    predictions = model.transform(pool.data)
    predictions.show(truncate=False)
def testSimpleBinaryClassification():
    """Fit a basic CatBoostClassifier and show output for every output-column combination.

    Six rows with string labels plus group id, group weight, subgroup id and
    weight columns are placed in a raw pool. After a 20-iteration fit, the
    model is transformed once for each on/off combination of the raw
    prediction, probability and prediction columns (a column is switched off
    by setting its name to the empty string), and each result is printed.
    Nothing is asserted.
    """
    spark = test_helpers.getOrCreateSparkSession(test_helpers.getCurrentMethodName())
    import catboost_spark

    featureNames = ["f1", "f2", "f3"]
    srcSchemaData = [
        ("features", VectorUDT()),
        ("label", StringType()),
        ("groupId", LongType()),
        ("groupWeight", FloatType()),
        ("subgroupId", IntegerType()),
        ("weight", FloatType()),
    ]

    # (features, label, groupId, groupWeight, subgroupId, weight)
    samples = [
        ((0.1, 0.2, 0.11), "0", 0xB337C6FEFE2E2F7, 1.0, 0xD34BFBD, 0.12),
        ((0.97, 0.82, 0.33), "0", 0xB337C6FEFE2E2F7, 1.0, 0x19CE5B0, 0.18),
        ((0.13, 0.22, 0.23), "1", 0x86F1B93B695F9E6, 0.0, 0x23D794E, 1.0),
        ((0.14, 0.18, 0.1), "1", 0xD9DBDD3199D6518, 0.5, 0x62772D1, 0.45),
        ((0.9, 0.67, 0.17), "0", 0xD9DBDD3199D6518, 0.5, 0x19CE5B0, 1.0),
        ((0.66, 0.1, 0.31), "1", 0xD9DBDD3199D6518, 0.5, 0x1FA606F, 2.0),
    ]
    srcData = [
        Row(Vectors.dense(*values), label, groupId, groupWeight, subgroupId, weight)
        for values, label, groupId, groupWeight, subgroupId, weight in samples
    ]

    # NOTE(review): `getCurrentMethodName` is passed uncalled here -- confirm
    # that createRawPool accepts this.
    schema = pool_test_helpers.createSchema(
        srcSchemaData,
        featureNames,
        addFeatureNamesMetadata=True
    )
    columnRoles = {
        "groupId": "groupId",
        "groupWeight": "groupWeight",
        "subgroupId": "subgroupId",
        "weight": "weight",
    }
    pool = pool_test_helpers.createRawPool(
        test_helpers.getCurrentMethodName,
        schema,
        srcData,
        columnRoles
    )

    classifier = catboost_spark.CatBoostClassifier()
    classifier = classifier.setIterations(20)
    trainDir = tempfile.mkdtemp(prefix=test_helpers.getCurrentMethodName())
    classifier = classifier.setTrainDir(trainDir)

    model = classifier.fit(pool)

    # Transform once with the model's initial column settings; this result is
    # replaced inside the loop below before it is ever shown.
    predictions = model.transform(pool.data)

    for withRawPrediction in [False, True]:
        for withProbability in [False, True]:
            for withPrediction in [False, True]:
                rawPredictionCol = ""
                if withRawPrediction:
                    rawPredictionCol = "rawPrediction"
                model.setRawPredictionCol(rawPredictionCol)

                probabilityCol = ""
                if withProbability:
                    probabilityCol = "probability"
                model.setProbabilityCol(probabilityCol)

                predictionCol = ""
                if withPrediction:
                    predictionCol = "prediction"
                model.setPredictionCol(predictionCol)

                predictions = model.transform(pool.data)

                header = '\nrawPrediction=%s, probability=%s, prediction=%s' % (
                    withRawPrediction, withProbability, withPrediction
                )
                print(header)
                predictions.show(truncate=False)
def testWithCrossValidator():
    """Run a 3-fold pyspark CrossValidator over CatBoostClassifier tree depth.

    Sixteen rows with labels 1.0/2.0 are loaded into a DataFrame. A
    20-iteration CatBoostClassifier is tuned over depth values 3 and 5 using
    a BinaryClassificationEvaluator that reads the "probability" column as
    its raw prediction input. The test only checks that `cv.fit` completes;
    nothing is asserted about the selected model.
    """
    spark = test_helpers.getOrCreateSparkSession(test_helpers.getCurrentMethodName())
    import catboost_spark

    featureNames = ["f1", "f2", "f3"]

    srcDataSchema = pool_test_helpers.createSchema(
        [
            ("features", VectorUDT()),
            ("label", DoubleType())
        ],
        featureNames,
        addFeatureNamesMetadata=True
    )

    srcData = [
        Row(Vectors.dense(0.1, 0.2, 0.11), 1.0),
        Row(Vectors.dense(0.97, 0.82, 0.33), 2.0),
        Row(Vectors.dense(0.13, 0.22, 0.23), 2.0),
        Row(Vectors.dense(0.14, 0.18, 0.1), 1.0),
        Row(Vectors.dense(0.9, 0.67, 0.17), 2.0),
        Row(Vectors.dense(0.66, 0.1, 0.31), 1.0),
        Row(Vectors.dense(0.13, 0.21, 0.6), 1.0),
        Row(Vectors.dense(0.9, 0.82, 0.04), 2.0),
        Row(Vectors.dense(0.87, 0.92, 1.0), 2.0),
        Row(Vectors.dense(0.0, 0.1, 0.1), 1.0),
        Row(Vectors.dense(0.0, 0.78, 0.19), 1.0),
        Row(Vectors.dense(0.1, 0.33, 0.28), 2.0),
        Row(Vectors.dense(0.01, 0.5, 0.2), 1.0),
        Row(Vectors.dense(0.2, 0.99, 1.0), 1.0),
        Row(Vectors.dense(0.56, 0.43, 0.88), 2.0),
        Row(Vectors.dense(0.98, 0.02, 0.73), 2.0),
    ]

    df = spark.createDataFrame(
        spark.sparkContext.parallelize(srcData),
        StructType(srcDataSchema)
    )

    estimator = catboost_spark.CatBoostClassifier(iterations=20)

    # Build the grid from the tuned estimator's own `depth` Param. pyspark
    # Params are tied to the instance that owns them, so a Param taken from a
    # separate throwaway classifier would not match this estimator when the
    # param maps are applied.
    spark_cv_grid_params = (
        pyspark.ml.tuning.ParamGridBuilder()
        .addGrid(estimator.depth, [3, 5])
        .build()
    )

    bce = pyspark.ml.evaluation.BinaryClassificationEvaluator(
        rawPredictionCol="probability",
        labelCol="label"
    )

    cv = pyspark.ml.tuning.CrossValidator(
        estimator=estimator,
        estimatorParamMaps=spark_cv_grid_params,
        evaluator=bce,
        numFolds=3,
        seed=1
    )
    cv.fit(df)
def testQuantize():
    """Run the shared quantization test case with default QuantizationParams.

    Builds a features/label schema for three named features and four data
    rows, then delegates to `implTestQuantizeCase`.
    """
    featureNames = ["f1", "f2", "f3"]

    spark = test_helpers.getOrCreateSparkSession(test_helpers.getCurrentMethodName())
    import catboost_spark

    schemaColumns = [
        ("features", VectorUDT()),
        ("label", DoubleType()),
    ]
    schema = pool_test_helpers.createSchema(
        schemaColumns,
        featureNames,
        addFeatureNamesMetadata=True
    )

    # (features, label)
    samples = [
        ((0.0, 1.0, 0.2), 0.0),
        ((0.1, 1.1, 2.1), 1.0),
        ((0.2, 1.2, 2.2), 1.0),
        ((0.0, 1.1, 3.2), 0.0),
    ]
    rows = [Row(Vectors.dense(*values), label) for values, label in samples]

    params = catboost_spark.QuantizationParams()

    implTestQuantizeCase(schema, srcData=rows, quantizationParams=params)
def testQuantizeWithNaNsAndIgnoredFeatures():
    """Run the shared quantization test case on data with NaNs and ignored features.

    Builds a features/label schema for four named features and seven data
    rows, some of which contain NaN feature values, then delegates to
    `implTestQuantizeCase` with `borderCount=2` and feature indices 0 and 2
    listed as ignored.
    """
    featureNames = ["F1", "F2", "F3", "F4"]

    spark = test_helpers.getOrCreateSparkSession(test_helpers.getCurrentMethodName())
    import catboost_spark

    schemaColumns = [
        ("features", VectorUDT()),
        ("label", DoubleType()),
    ]
    schema = pool_test_helpers.createSchema(
        schemaColumns,
        featureNames,
        addFeatureNamesMetadata=True
    )

    # (features, label)
    samples = [
        ((0.0, 1.0, 0.2, 100.11), 3.0),
        ((float('nan'), 1.1, float('nan'), 20.2), 1.0),
        ((0.2, 1.2, 2.2, 32.4), 11.0),
        ((float('nan'), 0.0, 2.2, 71.1), 0.2),
        ((float('nan'), 1.1, 0.4, 92.2), 6.1),
        ((0.1, 0.0, 1.8, 111.0), 2.0),
        ((0.28, 0.0, 8.3, 333.2), 0.0),
    ]
    rows = [Row(Vectors.dense(*values), label) for values, label in samples]

    params = catboost_spark.QuantizationParams(
        borderCount=2,
        ignoredFeaturesIndices=[0, 2]
    )

    implTestQuantizeCase(schema, srcData=rows, quantizationParams=params)
def testModelSerialization():
    """Save a fitted classification model in several formats and reload it.

    A 20-iteration CatBoostClassifier is fitted on a six-row DataFrame. The
    model is then:

    * saved as a native CatBoost file (the default format of
      `saveNativeModel`) and loaded back;
    * saved as native JSON (written only, never loaded back);
    * saved as ONNX with export metadata and loaded back;
    * saved through the Spark ML writer and loaded back.

    Predictions of the original and of each reloaded model are printed;
    nothing is asserted about their values. The temporary models directory is
    removed even when one of the save/load/transform steps raises.
    """
    spark = test_helpers.getOrCreateSparkSession(test_helpers.getCurrentMethodName())
    import catboost_spark

    featureNames = ["f1", "f2", "f3"]

    srcDataSchema = pool_test_helpers.createSchema(
        [
            ("features", VectorUDT()),
            ("label", DoubleType())
        ],
        featureNames,
        addFeatureNamesMetadata=True
    )

    srcData = [
        Row(Vectors.dense(0.1, 0.2, 0.11), 1.0),
        Row(Vectors.dense(0.97, 0.82, 0.33), 2.0),
        Row(Vectors.dense(0.13, 0.22, 0.23), 2.0),
        Row(Vectors.dense(0.14, 0.18, 0.1), 1.0),
        Row(Vectors.dense(0.9, 0.67, 0.17), 2.0),
        Row(Vectors.dense(0.66, 0.1, 0.31), 1.0),
    ]

    df = spark.createDataFrame(
        spark.sparkContext.parallelize(srcData),
        StructType(srcDataSchema)
    )

    classifier = (catboost_spark.CatBoostClassifier()
        .setIterations(20)
        .setTrainDir(tempfile.mkdtemp(prefix=test_helpers.getCurrentMethodName()))
    )
    model = classifier.fit(df)
    predictions = model.transform(df)

    print("predictions")
    predictions.show(truncate=False)

    modelsDir = tempfile.mkdtemp(prefix="catboost_models_")
    try:
        # Native CatBoost format: save, reload, predict.
        nativeCatBoostModelPath = os.path.join(modelsDir, "binclass_model_on_df.cbm")
        model.saveNativeModel(nativeCatBoostModelPath)

        loadedCatBoostModel = catboost_spark.CatBoostClassificationModel.loadNativeModel(
            nativeCatBoostModelPath
        )
        predictionsLoadedCatBoost = loadedCatBoostModel.transform(df)
        print("predictionsLoadedCatBoost")
        predictionsLoadedCatBoost.show(truncate=False)

        # JSON export is only written; it is not loaded back here.
        nativeJsonModelPath = os.path.join(modelsDir, "binclass_model_on_df.json")
        model.saveNativeModel(nativeJsonModelPath, catboost_spark.EModelType.Json)

        # ONNX format with export metadata: save, reload, predict.
        nativeOnnxModelPath = os.path.join(modelsDir, "binclass_model_on_df.onnx")
        model.saveNativeModel(
            nativeOnnxModelPath,
            catboost_spark.EModelType.Onnx,
            {
                "onnx_domain": "ai.catboost",
                "onnx_model_version": 1,
                "onnx_doc_string": "test model for classification",
                "onnx_graph_name": "CatBoostModel_for_classification"
            }
        )

        loadedOnnxModel = catboost_spark.CatBoostClassificationModel.loadNativeModel(
            nativeOnnxModelPath,
            catboost_spark.EModelType.Onnx
        )
        predictionsLoadedOnnx = loadedOnnxModel.transform(df)
        print("predictionsLoadedOnnx")
        predictionsLoadedOnnx.show(truncate=False)

        # Spark ML persistence: save, reload, predict.
        sparkModelPath = os.path.join(modelsDir, "binclass_model_on_df")
        model.write().overwrite().save(sparkModelPath)
        loadedModel = catboost_spark.CatBoostClassificationModel.load(sparkModelPath)

        predictionsLoaded = loadedModel.transform(df)
        print("predictionsLoaded")
        predictionsLoaded.show(truncate=False)
    finally:
        # Always remove the models directory so a failing step does not leak it.
        shutil.rmtree(modelsDir)