Example #1
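All snippets on this page come from the CatBoost for Apache Spark (catboost_spark) Python test suite and share a common preamble. A minimal import block such as the following is assumed (config, test_helpers and pool_test_helpers are local helper modules from that test suite, not published packages); catboost_spark itself is imported inside each function, after the Spark session has been created, following the pattern of the original tests:

import collections
import os
import shutil
import tempfile

import pyspark.ml.evaluation
import pyspark.ml.tuning
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql import Row
from pyspark.sql.types import (DoubleType, FloatType, IntegerType, LongType,
                               StringType, StructField, StructType)

import config              # local test configuration (provides CATBOOST_TEST_DATA_DIR)
import pool_test_helpers   # local helpers for building and printing Pools
import test_helpers        # local helpers for the Spark session and method names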
def testLoadDSVWithPairs():
    dataFile = pool_test_helpers.writeToTempFile(
        "0.12\tquery0\tsite1\t0.12\t1.0\t0.1\t0.2\t0.11\n" +
        "0.22\tquery0\tsite22\t0.18\t1.0\t0.97\t0.82\t0.33\n" +
        "0.34\tquery1\tSite9\t1.0\t0.0\t0.13\t0.22\t0.23\n" +
        "0.42\tQuery 2\tsite12\t0.45\t0.5\t0.14\t0.18\t0.1\n" +
        "0.01\tQuery 2\tsite22\t1.0\t0.5\t0.9\t0.67\t0.17\n" +
        "0.0\tQuery 2\tSite45\t2.0\t0.5\t0.66\t0.1\t0.31\n")
    cdFile = pool_test_helpers.writeToTempFile(
        "0\tTarget\n" +
        "1\tGroupId\n" +
        "2\tSubgroupId\n" +
        "3\tWeight\n" +
        "4\tGroupWeight\n" +
        "5\tNum\tf0\n" +
        "6\tNum\tf1\n" +
        "7\tNum\tf2\n")
    pairsFile = pool_test_helpers.writeToTempFile("query0\t0\t1\n" +
                                                  "Query 2\t0\t2\n" +
                                                  "Query 2\t1\t2\n")

    spark = test_helpers.getOrCreateSparkSession(
        test_helpers.getCurrentMethodName())
    import catboost_spark
    pool = catboost_spark.Pool.load(spark,
                                    dataFile,
                                    columnDescription=cdFile,
                                    pairsDataPathWithScheme="dsv-grouped://" +
                                    pairsFile)

    pool_test_helpers.printPool(pool)
Example #2
def testBinaryClassificationWithClassWeightsMap():
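    # Binary classification with per-class weights: classWeightsMap below maps
    # each class label ("0", "1") to the weight applied to samples of that class.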
    spark = test_helpers.getOrCreateSparkSession(
        test_helpers.getCurrentMethodName())
    import catboost_spark

    featureNames = ["f1", "f2", "f3"]

    srcSchemaData = [("features", VectorUDT()), ("label", IntegerType())]

    srcData = [
        Row(Vectors.dense(0.1, 0.2, 0.11), 0),
        Row(Vectors.dense(0.97, 0.82, 0.33), 1),
        Row(Vectors.dense(0.13, 0.22, 0.23), 1),
        Row(Vectors.dense(0.14, 0.18, 0.1), 0),
        Row(Vectors.dense(0.9, 0.67, 0.17), 0),
        Row(Vectors.dense(0.66, 0.1, 0.31), 0)
    ]
    pool = pool_test_helpers.createRawPool(
        test_helpers.getCurrentMethodName(),
        pool_test_helpers.createSchema(srcSchemaData,
                                       featureNames,
                                       addFeatureNamesMetadata=True),
        srcData,
        {})

    classWeightsMap = collections.OrderedDict([("0", 1.0), ("1", 2.0)])

    classifier = (catboost_spark.CatBoostClassifier()
      .setIterations(20)
      .setClassWeightsMap(classWeightsMap)
      .setLoggingLevel(catboost_spark.ELoggingLevel.Debug)
      .setTrainDir(tempfile.mkdtemp(prefix=test_helpers.getCurrentMethodName())))

    model = classifier.fit(pool)
    predictions = model.transform(pool.data)
    predictions.show(truncate=False)
Example #3
def testSimpleOnDataFrame():
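    # Minimal regression example: a CatBoostRegressor is fit directly on a Spark
    # DataFrame, without wrapping it in a catboost_spark.Pool first.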
    spark = test_helpers.getOrCreateSparkSession(test_helpers.getCurrentMethodName())
    import catboost_spark

    featureNames = ["f1", "f2", "f3"]

    srcDataSchema = pool_test_helpers.createSchema(
        [
            ("features", VectorUDT()),
            ("label", DoubleType())
        ],
        featureNames,
        addFeatureNamesMetadata=True
    )

    srcData = [
      Row(Vectors.dense(0.1, 0.2, 0.11), 0.12),
      Row(Vectors.dense(0.97, 0.82, 0.33), 1.1),
      Row(Vectors.dense(0.13, 0.22, 0.23), 2.1),
      Row(Vectors.dense(0.14, 0.18, 0.1), 0.0),
      Row(Vectors.dense(0.9, 0.67, 0.17), -1.0),
      Row(Vectors.dense(0.66, 0.1, 0.31), 0.62)
    ]

    df = spark.createDataFrame(spark.sparkContext.parallelize(srcData), StructType(srcDataSchema))

    regressor = (catboost_spark.CatBoostRegressor()
      .setIterations(20)
      .setTrainDir(tempfile.mkdtemp(prefix=test_helpers.getCurrentMethodName())))
    model = regressor.fit(df)
    predictions = model.transform(df)

    print ("predictions")
    predictions.show(truncate=False)
Example #4
def testPredictionDiff():
    spark = test_helpers.getOrCreateSparkSession(test_helpers.getCurrentMethodName())
    import catboost_spark

    dataDir = os.path.join(config.CATBOOST_TEST_DATA_DIR, 'higgs')

    trainPool = catboost_spark.Pool.load(
      spark,
      dataPathWithScheme = os.path.join(dataDir, "train_small"),
      columnDescription = os.path.join(dataDir, "train.cd")
    )
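    # PredictionDiff feature importance is computed for a pair of documents,
    # so only the first two rows of the training data are used here.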
    dataForPredictionDiff = catboost_spark.Pool(trainPool.data.limit(2))

    regressor = (catboost_spark.CatBoostRegressor()
      .setIterations(20)
      .setTrainDir(tempfile.mkdtemp(prefix=test_helpers.getCurrentMethodName())))
    model = regressor.fit(trainPool)


    featureImportances = model.getFeatureImportance(
        fstrType=catboost_spark.EFstrType.PredictionDiff,
        data=dataForPredictionDiff
    )
    print('featureImportancesPredictionDiff=')
    print(featureImportances)

    featureImportancesPrettified = model.getFeatureImportancePrettified(
        fstrType=catboost_spark.EFstrType.PredictionDiff,
        data=dataForPredictionDiff
    )

    print('featureImportancesPredictionDiffPrettified=')
    for e in featureImportancesPrettified:
        print('featureName={},importance={}'.format(e.featureName(), e.importance()))
Example #5
def testBinaryClassificationWithTargetBorder():
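    # The labels here are real-valued; setTargetBorder(0.5) below binarizes
    # them, treating targets above the border as the positive class.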
    spark = test_helpers.getOrCreateSparkSession(
        test_helpers.getCurrentMethodName())
    import catboost_spark

    featureNames = ["f1", "f2", "f3"]

    srcSchemaData = [("features", VectorUDT()), ("label", DoubleType())]

    srcData = [
        Row(Vectors.dense(0.1, 0.2, 0.11), 0.12),
        Row(Vectors.dense(0.97, 0.82, 0.33), 0.1),
        Row(Vectors.dense(0.13, 0.22, 0.23), 0.7),
        Row(Vectors.dense(0.14, 0.18, 0.1), 0.33),
        Row(Vectors.dense(0.9, 0.67, 0.17), 0.82),
        Row(Vectors.dense(0.66, 0.1, 0.31), 0.93)
    ]
    pool = pool_test_helpers.createRawPool(
        test_helpers.getCurrentMethodName(),
        pool_test_helpers.createSchema(srcSchemaData,
                                       featureNames,
                                       addFeatureNamesMetadata=True),
        srcData,
        {})

    classifier = (catboost_spark.CatBoostClassifier()
      .setIterations(20)
      .setTargetBorder(0.5)
      .setTrainDir(tempfile.mkdtemp(prefix=test_helpers.getCurrentMethodName())))

    model = classifier.fit(pool)
    predictions = model.transform(pool.data)
    predictions.show(truncate=False)
Example #6
def testInteraction():
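    # getFeatureImportanceInteraction() returns pairwise feature interaction
    # scores as (firstFeatureIdx, secondFeatureIdx, score) entries.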
    spark = test_helpers.getOrCreateSparkSession(test_helpers.getCurrentMethodName())
    import catboost_spark

    dataDir = os.path.join(config.CATBOOST_TEST_DATA_DIR, 'querywise')

    trainPool = catboost_spark.Pool.load(
      spark,
      dataPathWithScheme = os.path.join(dataDir, "train"),
      columnDescription = os.path.join(dataDir, "train.cd")
    )

    regressor = (catboost_spark.CatBoostRegressor()
      .setIterations(20)
      .setLossFunction("QueryRMSE")
      .setTrainDir(tempfile.mkdtemp(prefix=test_helpers.getCurrentMethodName())))
    model = regressor.fit(trainPool)

    featureImportancesInteraction = model.getFeatureImportanceInteraction()
    for e in featureImportancesInteraction:
        print(
            'firstFeatureIdx={},secondFeatureIdx={},score={}'.format(
                e.firstFeatureIdx(),
                e.secondFeatureIdx(),
                e.score()
            )
        )
Example #7
def testWithEvalSet():
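    # Evaluation sets are passed to fit() as a list of Pools alongside the
    # training Pool (see regressor.fit(trainPool, [testPool]) below).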
    spark = test_helpers.getOrCreateSparkSession(test_helpers.getCurrentMethodName())
    import catboost_spark

    featureNames = ["f1", "f2", "f3"]

    srcSchemaData = [
        ("features", VectorUDT()),
        ("label", StringType()),
        ("groupId", LongType()),
        ("subgroupId", IntegerType()),
        ("weight", FloatType())
    ]

    srcTrainData = [
      Row(Vectors.dense(0.13, 0.22, 0.23), "0.34", 0x86F1B93B695F9E6, 0x23D794E, 1.0),
      Row(Vectors.dense(0.1, 0.2, 0.11), "0.12", 0xB337C6FEFE2E2F7, 0xD34BFBD, 0.12),
      Row(Vectors.dense(0.97, 0.82, 0.33), "0.22", 0xB337C6FEFE2E2F7, 0x19CE5B0, 0.18),
      Row(Vectors.dense(0.9, 0.67, 0.17), "0.01", 0xD9DBDD3199D6518, 0x19CE5B0, 1.0),
      Row(Vectors.dense(0.66, 0.1, 0.31), "0.0", 0xD9DBDD3199D6518, 0x1FA606F, 2.0),
      Row(Vectors.dense(0.14, 0.18, 0.1), "0.42", 0xD9DBDD3199D6518, 0x62772D1, 0.45)
    ]
    srcTestData = [
      Row(Vectors.dense(0.0, 0.33, 1.1), "0.22", 0x4AAFFF456765757, 0xD34BFBD, 0.1),
      Row(Vectors.dense(0.02, 0.0, 0.38), "0.11", 0x686726738873ABC, 0x23D794E, 1.0),
      Row(Vectors.dense(0.86, 0.54, 0.9), "0.48", 0x7652786FF37ABBE, 0x19CE5B0, 0.17)
    ]

    trainPool = pool_test_helpers.createRawPool(
        test_helpers.getCurrentMethodName(),
        pool_test_helpers.createSchema(
          srcSchemaData,
          featureNames,
          addFeatureNamesMetadata=True
        ),
        srcTrainData,
        {"groupId": "groupId", "subgroupId": "subgroupId", "weight": "weight"}
    )
    testPool = pool_test_helpers.createRawPool(
        test_helpers.getCurrentMethodName(),
        pool_test_helpers.createSchema(
          srcSchemaData,
          featureNames,
          addFeatureNamesMetadata=True
        ),
        srcTestData,
        {"groupId": "groupId", "subgroupId": "subgroupId", "weight": "weight"}
    )

    regressor = (catboost_spark.CatBoostRegressor()
      .setIterations(20)
      .setTrainDir(tempfile.mkdtemp(prefix=test_helpers.getCurrentMethodName()))
    )
    model = regressor.fit(trainPool, [testPool])
    predictions = model.transform(testPool.data)

    print ("predictions")
    predictions.show(truncate=False)
Example #8
def testSimpleBinaryClassification():
    spark = test_helpers.getOrCreateSparkSession(
        test_helpers.getCurrentMethodName())
    import catboost_spark

    featureNames = ["f1", "f2", "f3"]

    srcSchemaData = [("features", VectorUDT()), ("label", StringType()),
                     ("groupId", LongType()), ("groupWeight", FloatType()),
                     ("subgroupId", IntegerType()), ("weight", FloatType())]

    srcData = [
        Row(Vectors.dense(0.1, 0.2, 0.11), "0", 0xB337C6FEFE2E2F7, 1.0,
            0xD34BFBD, 0.12),
        Row(Vectors.dense(0.97, 0.82, 0.33), "0", 0xB337C6FEFE2E2F7, 1.0,
            0x19CE5B0, 0.18),
        Row(Vectors.dense(0.13, 0.22, 0.23), "1", 0x86F1B93B695F9E6, 0.0,
            0x23D794E, 1.0),
        Row(Vectors.dense(0.14, 0.18, 0.1), "1", 0xD9DBDD3199D6518, 0.5,
            0x62772D1, 0.45),
        Row(Vectors.dense(0.9, 0.67, 0.17), "0", 0xD9DBDD3199D6518, 0.5,
            0x19CE5B0, 1.0),
        Row(Vectors.dense(0.66, 0.1, 0.31), "1", 0xD9DBDD3199D6518, 0.5,
            0x1FA606F, 2.0)
    ]
    pool = pool_test_helpers.createRawPool(
        test_helpers.getCurrentMethodName(),
        pool_test_helpers.createSchema(srcSchemaData,
                                       featureNames,
                                       addFeatureNamesMetadata=True),
        srcData,
        {
            "groupId": "groupId",
            "groupWeight": "groupWeight",
            "subgroupId": "subgroupId",
            "weight": "weight"
        })

    classifier = (catboost_spark.CatBoostClassifier()
      .setIterations(20)
      .setTrainDir(tempfile.mkdtemp(prefix=test_helpers.getCurrentMethodName())))

    model = classifier.fit(pool)
    predictions = model.transform(pool.data)

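    # Each prediction-related output column can be disabled by setting its name
    # to an empty string; the loops below try all 8 on/off combinations.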
    for rawPrediction in [False, True]:
        for probability in [False, True]:
            for prediction in [False, True]:
                model.setRawPredictionCol("rawPrediction" if rawPrediction else "")
                model.setProbabilityCol("probability" if probability else "")
                model.setPredictionCol("prediction" if prediction else "")
                predictions = model.transform(pool.data)

                print('\nrawPrediction=%s, probability=%s, prediction=%s' %
                      (rawPrediction, probability, prediction))
                predictions.show(truncate=False)
Example #9
def testModelSerializationInPipeline():
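    # Categorical text columns (Gender1, Country3) are label-encoded with
    # StringIndexer and assembled into a feature vector; the whole fitted
    # Pipeline, including the CatBoost model, is then saved and re-loaded.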
    spark = test_helpers.getOrCreateSparkSession(
        test_helpers.getCurrentMethodName())
    import catboost_spark

    srcData = [
        Row(0, "query0", 0.1, "Male", 0.2, "Germany", 0.11),
        Row(1, "query0", 0.97, "Female", 0.82, "Russia", 0.33),
        Row(1, "query1", 0.13, "Male", 0.22, "USA", 0.23),
        Row(0, "Query 2", 0.14, "Male", 0.18, "Finland", 0.1),
        Row(1, "Query 2", 0.9, "Female", 0.67, "USA", 0.17),
        Row(0, "Query 2", 0.66, "Female", 0.1, "UK", 0.31)
    ]
    srcDataSchema = [
        StructField("Label", IntegerType()),
        StructField("GroupId", StringType()),
        StructField("float0", DoubleType()),
        StructField("Gender1", StringType()),
        StructField("float2", DoubleType()),
        StructField("Country3", StringType()),
        StructField("float4", DoubleType())
    ]

    df = spark.createDataFrame(spark.sparkContext.parallelize(srcData),
                               StructType(srcDataSchema))

    indexers = [
        StringIndexer(inputCol=catFeature, outputCol=catFeature + "Index")
        for catFeature in ["Gender1", "Country3"]
    ]
    assembler = VectorAssembler(
        inputCols=["float0", "Gender1Index", "float2", "Country3Index", "float4"],
        outputCol="features")
    classifier = catboost_spark.CatBoostClassifier(labelCol="Label",
                                                   iterations=20)

    pipeline = Pipeline(stages=indexers + [assembler, classifier])
    pipelineModel = pipeline.fit(df)

    serializationDir = tempfile.mkdtemp(
        prefix=test_helpers.getCurrentMethodName())

    modelPath = os.path.join(serializationDir, "serialized_pipeline_model")

    pipelineModel.write().overwrite().save(modelPath)
    loadedPipelineModel = PipelineModel.load(modelPath)

    print("predictions")
    pipelineModel.transform(df).show(truncate=False)

    print("predictionsLoaded")
    loadedPipelineModel.transform(df).show(truncate=False)

    shutil.rmtree(serializationDir)
Example #10
def testLossFunctionChange():
    spark = test_helpers.getOrCreateSparkSession(test_helpers.getCurrentMethodName())
    import catboost_spark

    dataDir = os.path.join(config.CATBOOST_TEST_DATA_DIR, 'querywise')

    trainPool = catboost_spark.Pool.load(
      spark,
      dataPathWithScheme = os.path.join(dataDir, "train"),
      columnDescription = os.path.join(dataDir, "train.cd")
    )

    regressor = (catboost_spark.CatBoostRegressor()
      .setIterations(20)
      .setLossFunction("QueryRMSE")
      .setTrainDir(tempfile.mkdtemp(prefix=test_helpers.getCurrentMethodName())))
    model = regressor.fit(trainPool)

    calcTypes = [
        catboost_spark.ECalcTypeShapValues.Regular,
        catboost_spark.ECalcTypeShapValues.Approximate,
        catboost_spark.ECalcTypeShapValues.Exact
    ]
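    # ECalcTypeShapValues selects how the underlying SHAP values are computed;
    # the loop below evaluates the same importances under each mode.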

    for calcType in calcTypes:
        featureImportancesPredictionValuesChange = model.getFeatureImportance(
            fstrType=catboost_spark.EFstrType.LossFunctionChange,
            data=trainPool,
            calcType=calcType
        )
        print('calcType=' + str(calcType) + ',featureImportancesLossFunctionChange=')
        print(featureImportancesPredictionValuesChange)

        featureImportancesDefault = model.getFeatureImportance(data=trainPool, calcType=calcType)
        print('calcType=' + str(calcType) + ',featureImportancesDefault=')
        print(featureImportancesDefault)

        featureImportancesPredictionValuesChangePrettified = model.getFeatureImportancePrettified(
            fstrType=catboost_spark.EFstrType.LossFunctionChange,
            data=trainPool,
            calcType=calcType
        )
        print('calcType=' + str(calcType) + ',featureImportancesLossFunctionChangePrettified=')
        for e in featureImportancesPredictionValuesChangePrettified:
            print('featureName={},importance={}'.format(e.featureName(), e.importance()))

        featureImportancesDefaultPrettified = model.getFeatureImportancePrettified(
            data=trainPool,
            calcType=calcType
        )
        print('calcType=' + str(calcType) + ',featureImportancesDefaultPrettified=')
        for e in featureImportancesDefaultPrettified:
            print('featureName={},importance={}'.format(e.featureName(), e.importance()))
Example #11
def testLoadLibSVMSimple():
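    # The "libsvm://" scheme loads sparse data in LibSVM text format; the
    # column description file only needs to declare the Target column.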
    dataFile = pool_test_helpers.writeToTempFile(
        "0 1:0.1 3:0.2\n" + "1 2:0.97 5:0.82 6:0.11 8:1.2\n" +
        "0 3:0.13 7:0.22 8:0.17\n")
    cdFile = pool_test_helpers.writeToTempFile("0\tTarget")

    spark = test_helpers.getOrCreateSparkSession(
        test_helpers.getCurrentMethodName())
    import catboost_spark
    pool = catboost_spark.Pool.load(spark, "libsvm://" + dataFile)

    pool_test_helpers.printPool(pool)
Example #12
def testWithPairs():
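    # Pairwise ranking with the PairLogit loss: preference pairs are supplied
    # as (groupId, winnerId, loserId) rows alongside the main dataset.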
    spark = test_helpers.getOrCreateSparkSession(test_helpers.getCurrentMethodName())
    import catboost_spark

    featureNames = ["f1", "f2", "f3"]

    srcSchemaData = [
        ("features", VectorUDT()),
        ("label", StringType()),
        ("groupId", LongType()),
        ("sampleId", LongType()),
        ("weight", FloatType())
    ]

    srcData = [
        Row(Vectors.dense(0.13, 0.22, 0.23), "0.34", 0x86F1B93B695F9E6, 0x23D794E, 1.0),
        Row(Vectors.dense(0.1, 0.2, 0.11), "0.12", 0xB337C6FEFE2E2F7, 0xD34BFBD, 0.12),
        Row(Vectors.dense(0.97, 0.82, 0.33), "0.22", 0xB337C6FEFE2E2F7, 0x19CE5B0, 0.18),
        Row(Vectors.dense(0.9, 0.67, 0.17), "0.01", 0xD9DBDD3199D6518, 0x19CE5B0, 1.0),
        Row(Vectors.dense(0.66, 0.1, 0.31), "0.0", 0xD9DBDD3199D6518, 0x1FA606F, 2.0),
        Row(Vectors.dense(0.14, 0.18, 0.1), "0.42", 0xD9DBDD3199D6518, 0x62772D1, 0.45)
    ]
    srcPairsData = [
        Row(0xB337C6FEFE2E2F7, 0xD34BFBD, 0x19CE5B0),
        Row(0xD9DBDD3199D6518, 0x19CE5B0, 0x62772D1),
        Row(0xD9DBDD3199D6518, 0x62772D1, 0x1FA606F)
    ]

    pool = pool_test_helpers.createRawPool(
        test_helpers.getCurrentMethodName(),
        pool_test_helpers.createSchema(
          srcSchemaData,
          featureNames,
          addFeatureNamesMetadata=True
        ),
        srcData,
        {"groupId": "groupId", "sampleId": "sampleId", "weight": "weight"},
        srcPairsData
    )

    regressor = (catboost_spark.CatBoostRegressor()
      .setIterations(20)
      .setTrainDir(tempfile.mkdtemp(prefix=test_helpers.getCurrentMethodName()))
      .setLossFunction("PairLogit")
      .setHasTime(True)
    )

    model = regressor.fit(pool)
    predictions = model.transform(pool.data)

    print ("predictions")
    predictions.show(truncate=False)
Example #13
def testLoadDSVSimple():
    dataFile = pool_test_helpers.writeToTempFile("0\t0.1\t0.2\n" +
                                                 "1\t0.97\t0.82\n" +
                                                 "0\t0.13\t0.22\n")
    cdFile = pool_test_helpers.writeToTempFile("0\tTarget")

    spark = test_helpers.getOrCreateSparkSession(
        test_helpers.getCurrentMethodName())
    import catboost_spark
    pool = catboost_spark.Pool.load(spark, dataFile, columnDescription=cdFile)
    featureNames = ["_f0", "_f1"]

    pool.data.show(truncate=False)
    """
Example #14
def implTestQuantizeCase(srcDataSchema, srcData, quantizationParams):
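    # Shared helper: builds a raw Pool from the given schema and rows, quantizes
    # it with the supplied QuantizationParams, and prints both pools.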
    spark = test_helpers.getOrCreateSparkSession(
        test_helpers.getCurrentMethodName())
    import catboost_spark

    print("srcDataSchema=", srcDataSchema)

    df = spark.createDataFrame(spark.sparkContext.parallelize(srcData),
                               StructType(srcDataSchema))

    pool = catboost_spark.Pool(df)
    quantizedPool = pool.quantize(quantizationParams)

    pool_test_helpers.printPool(pool, 'raw')
    pool_test_helpers.printPool(quantizedPool, 'quantized')
Example #15
def testParams():
    spark = test_helpers.getOrCreateSparkSession(test_helpers.getCurrentMethodName())
    import catboost_spark

    featureNames = ["f1", "f2", "f3"]

    srcSchemaData = [
        ("features", VectorUDT()),
        ("label", StringType()),
        ("groupId", LongType()),
        ("subgroupId", IntegerType()),
        ("weight", FloatType())
    ]

    srcData = [
      Row(Vectors.dense(0.13, 0.22, 0.23), "0.34", 0x86F1B93B695F9E6, 0x23D794E, 1.0),
      Row(Vectors.dense(0.1, 0.2, 0.11), "0.12", 0xB337C6FEFE2E2F7, 0xD34BFBD, 0.12),
      Row(Vectors.dense(0.97, 0.82, 0.33), "0.22", 0xB337C6FEFE2E2F7, 0x19CE5B0, 0.18),
      Row(Vectors.dense(0.9, 0.67, 0.17), "0.01", 0xD9DBDD3199D6518, 0x19CE5B0, 1.0),
      Row(Vectors.dense(0.66, 0.1, 0.31), "0.0", 0xD9DBDD3199D6518, 0x1FA606F, 2.0),
      Row(Vectors.dense(0.14, 0.18, 0.1), "0.42", 0xD9DBDD3199D6518, 0x62772D1, 0.45)
    ]
    pool = pool_test_helpers.createRawPool(
        test_helpers.getCurrentMethodName(),
        pool_test_helpers.createSchema(
          srcSchemaData,
          featureNames,
          addFeatureNamesMetadata=True
        ),
        srcData,
        {"groupId": "groupId", "subgroupId": "subgroupId", "weight": "weight"}
    )

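    # firstFeatureUsePenaltiesMap penalizes the first use of each feature (keyed
    # by feature name); setFeatureWeightsList assigns per-feature weights in
    # feature order.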
    firstFeatureUsePenaltiesMap = collections.OrderedDict([("f1", 0.0), ("f2", 1.1), ("f3", 2.0)])

    regressor = (catboost_spark.CatBoostRegressor()
      .setIterations(20)
      .setTrainDir(tempfile.mkdtemp(prefix=test_helpers.getCurrentMethodName()))
      .setLeafEstimationIterations(10)
      .setFirstFeatureUsePenaltiesMap(firstFeatureUsePenaltiesMap)
      .setFeatureWeightsList([1.0, 2.0, 3.0])
    )
    model = regressor.fit(pool)
    predictions = model.transform(pool.data)

    print ("predictions")
    predictions.show(truncate=False)
Example #16
def testWithCrossValidator():
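    # CatBoost estimators implement the standard pyspark.ml interfaces, so they
    # can be tuned with pyspark.ml.tuning.CrossValidator and a ParamGridBuilder.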
    spark = test_helpers.getOrCreateSparkSession(
        test_helpers.getCurrentMethodName())
    import catboost_spark

    featureNames = ["f1", "f2", "f3"]

    srcDataSchema = pool_test_helpers.createSchema(
        [("features", VectorUDT()), ("label", DoubleType())],
        featureNames,
        addFeatureNamesMetadata=True)

    srcData = [
        Row(Vectors.dense(0.1, 0.2, 0.11), 1.0),
        Row(Vectors.dense(0.97, 0.82, 0.33), 2.0),
        Row(Vectors.dense(0.13, 0.22, 0.23), 2.0),
        Row(Vectors.dense(0.14, 0.18, 0.1), 1.0),
        Row(Vectors.dense(0.9, 0.67, 0.17), 2.0),
        Row(Vectors.dense(0.66, 0.1, 0.31), 1.0),
        Row(Vectors.dense(0.13, 0.21, 0.6), 1.0),
        Row(Vectors.dense(0.9, 0.82, 0.04), 2.0),
        Row(Vectors.dense(0.87, 0.92, 1.0), 2.0),
        Row(Vectors.dense(0.0, 0.1, 0.1), 1.0),
        Row(Vectors.dense(0.0, 0.78, 0.19), 1.0),
        Row(Vectors.dense(0.1, 0.33, 0.28), 2.0),
        Row(Vectors.dense(0.01, 0.5, 0.2), 1.0),
        Row(Vectors.dense(0.2, 0.99, 1.0), 1.0),
        Row(Vectors.dense(0.56, 0.43, 0.88), 2.0),
        Row(Vectors.dense(0.98, 0.02, 0.73), 2.0)
    ]

    df = spark.createDataFrame(spark.sparkContext.parallelize(srcData),
                               StructType(srcDataSchema))

    estimator = catboost_spark.CatBoostClassifier(iterations=20)
    spark_cv_grid_params = pyspark.ml.tuning.ParamGridBuilder().addGrid(
        estimator.depth, [3, 5]).build()
    bce = pyspark.ml.evaluation.BinaryClassificationEvaluator(
        rawPredictionCol="probability", labelCol="label")
    cv = pyspark.ml.tuning.CrossValidator(
        estimator=estimator,
        estimatorParamMaps=spark_cv_grid_params,
        evaluator=bce,
        numFolds=3,
        seed=1)
    cv.fit(df)
Example #17
def testQuantize():
    featureNames = ["f1", "f2", "f3"]

    spark = test_helpers.getOrCreateSparkSession(
        test_helpers.getCurrentMethodName())
    import catboost_spark
    implTestQuantizeCase(
        pool_test_helpers.createSchema([("features", VectorUDT()),
                                        ("label", DoubleType())],
                                       featureNames,
                                       addFeatureNamesMetadata=True),
        srcData=[
            Row(Vectors.dense(0.0, 1.0, 0.2), 0.0),
            Row(Vectors.dense(0.1, 1.1, 2.1), 1.0),
            Row(Vectors.dense(0.2, 1.2, 2.2), 1.0),
            Row(Vectors.dense(0.0, 1.1, 3.2), 0.0)
        ],
        quantizationParams=catboost_spark.QuantizationParams())
Example #18
def testShapInteractionValuesForRegression():
    spark = test_helpers.getOrCreateSparkSession(test_helpers.getCurrentMethodName())
    import catboost_spark

    dataDir = os.path.join(config.CATBOOST_TEST_DATA_DIR, 'higgs')

    trainPool = catboost_spark.Pool.load(
      spark,
      dataPathWithScheme = os.path.join(dataDir, "train_small"),
      columnDescription = os.path.join(dataDir, "train.cd")
    )

    regressor = (catboost_spark.CatBoostRegressor()
      .setIterations(20)
      .setLossFunction("RMSE")
      .setTrainDir(tempfile.mkdtemp(prefix=test_helpers.getCurrentMethodName())))
    model = regressor.fit(trainPool)

    shapInteractionValuesTestCase('Regression', model, trainPool)
Example #19
def testShapInteractionValuesForMultiClass():
    spark = test_helpers.getOrCreateSparkSession(test_helpers.getCurrentMethodName())
    import catboost_spark

    dataDir = os.path.join(config.CATBOOST_TEST_DATA_DIR, 'cloudness_small')

    trainPool = catboost_spark.Pool.load(
      spark,
      dataPathWithScheme = os.path.join(dataDir, "train_small"),
      columnDescription = os.path.join(dataDir, "train_float.cd")
    )

    classifier = (catboost_spark.CatBoostClassifier()
      .setIterations(20)
      .setLossFunction("MultiClass")
      .setTrainDir(tempfile.mkdtemp(prefix=test_helpers.getCurrentMethodName())))
    model = classifier.fit(trainPool)

    shapInteractionValuesTestCase('MultiClass', model, trainPool)
Example #20
def testLoadDSVWithDelimiter():
    dataFile = pool_test_helpers.writeToTempFile("Target,Feat0,Feat1\n" +
                                                 "0,0.1,0.2\n" +
                                                 "1,0.97,0.82\n" +
                                                 "0,0.13,0.22\n")
    cdFile = pool_test_helpers.writeToTempFile("0\tTarget")

    spark = test_helpers.getOrCreateSparkSession(
        test_helpers.getCurrentMethodName())
    import catboost_spark
    pool = catboost_spark.Pool.load(
        spark,
        dataFile,
        columnDescription=cdFile,
        poolLoadParams=catboost_spark.PoolLoadParams(hasHeader=True,
                                                     delimiter=','))
    featureNames = ["_f0", "_f1"]

    pool_test_helpers.printPool(pool)
Example #21
def testClassifierSerialization():
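    # Estimators themselves (not just fitted models) support save()/load(),
    # both with default parameters and with parameters already set.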
    spark = test_helpers.getOrCreateSparkSession(
        test_helpers.getCurrentMethodName())
    import catboost_spark

    serializationDir = tempfile.mkdtemp(prefix="catboost_models_")

    path = os.path.join(serializationDir, "serialized_classifier_0")

    classifier = catboost_spark.CatBoostClassifier()
    classifier.write().overwrite().save(path)
    loadedClassifier = catboost_spark.CatBoostClassifier.load(path)

    path = os.path.join(serializationDir, "serialized_classifier_1")

    classifier = (catboost_spark.CatBoostClassifier()
      .setLossFunction("MultiClass")
      .setIterations(2))
    classifier.write().overwrite().save(path)
    loadedClassifier = catboost_spark.CatBoostClassifier.load(path)

    shutil.rmtree(serializationDir)
Example #22
def testQuantizeWithNaNsAndIgnoredFeatures():
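    # Quantization tolerates NaN feature values, and ignoredFeaturesIndices
    # marks features 0 and 2 as ignored during quantization.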
    featureNames = ["F1", "F2", "F3", "F4"]

    spark = test_helpers.getOrCreateSparkSession(
        test_helpers.getCurrentMethodName())
    import catboost_spark
    implTestQuantizeCase(
        pool_test_helpers.createSchema([("features", VectorUDT()),
                                        ("label", DoubleType())],
                                       featureNames,
                                       addFeatureNamesMetadata=True),
        srcData=[
            Row(Vectors.dense(0.0, 1.0, 0.2, 100.11), 3.0),
            Row(Vectors.dense(float('nan'), 1.1, float('nan'), 20.2), 1.0),
            Row(Vectors.dense(0.2, 1.2, 2.2, 32.4), 11.0),
            Row(Vectors.dense(float('nan'), 0.0, 2.2, 71.1), 0.2),
            Row(Vectors.dense(float('nan'), 1.1, 0.4, 92.2), 6.1),
            Row(Vectors.dense(0.1, 0.0, 1.8, 111.0), 2.0),
            Row(Vectors.dense(0.28, 0.0, 8.3, 333.2), 0.0)
        ],
        quantizationParams=catboost_spark.QuantizationParams(
            borderCount=2, ignoredFeaturesIndices=[0, 2]))
Example #23
def createRawPool(
    appName,            #: String
    srcDataSchema,      #: Seq[StructField]
    srcData,            #: Seq[Row]
    columnNames,        #: Map[String, String], maps standard column name to column name in the dataset
    srcPairsData=None   #: Seq[Row]
):
    spark = test_helpers.getOrCreateSparkSession(test_helpers.getCurrentMethodName())
    import catboost_spark

    df = spark.createDataFrame(spark.sparkContext.parallelize(srcData), StructType(srcDataSchema))

    if srcPairsData:
        pairsDataSchema = StructType(
            [
                StructField("groupId", LongType(), False),
                StructField("winnerId", IntegerType(), False),
                StructField("loserId", IntegerType(), False)
            ]
        )
        pairsDf = spark.createDataFrame(spark.sparkContext.parallelize(srcPairsData), pairsDataSchema)
        pool = catboost_spark.Pool(df, pairsDf)
    else:
        pool = catboost_spark.Pool(df)

    if ("features" in columnNames):
        pool = pool.setFeaturesCol(columnNames["features"])
    if ("groupId" in columnNames):
        pool = pool.setGroupIdCol(columnNames["groupId"])
    if ("sampleId" in columnNames):
        pool = pool.setSampleIdCol(columnNames["sampleId"])
    if ("subgroupId" in columnNames):
        pool = pool.setSubgroupIdCol(columnNames["subgroupId"])
    if ("weight" in columnNames):
        pool = pool.setWeightCol(columnNames["weight"])
    if ("groupWeight" in columnNames):
        pool = pool.setGroupWeightCol(columnNames["groupWeight"])
    return pool
Example #24
def testModelSerialization():
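    # A fitted classification model can be exported to native CatBoost (.cbm),
    # JSON and ONNX formats, and also saved/loaded through the Spark ML writer.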
    spark = test_helpers.getOrCreateSparkSession(
        test_helpers.getCurrentMethodName())
    import catboost_spark

    featureNames = ["f1", "f2", "f3"]

    srcDataSchema = pool_test_helpers.createSchema(
        [("features", VectorUDT()), ("label", DoubleType())],
        featureNames,
        addFeatureNamesMetadata=True)

    srcData = [
        Row(Vectors.dense(0.1, 0.2, 0.11), 1.0),
        Row(Vectors.dense(0.97, 0.82, 0.33), 2.0),
        Row(Vectors.dense(0.13, 0.22, 0.23), 2.0),
        Row(Vectors.dense(0.14, 0.18, 0.1), 1.0),
        Row(Vectors.dense(0.9, 0.67, 0.17), 2.0),
        Row(Vectors.dense(0.66, 0.1, 0.31), 1.0)
    ]

    df = spark.createDataFrame(spark.sparkContext.parallelize(srcData),
                               StructType(srcDataSchema))

    classifier = (catboost_spark.CatBoostClassifier()
      .setIterations(20)
      .setTrainDir(tempfile.mkdtemp(prefix=test_helpers.getCurrentMethodName())))
    model = classifier.fit(df)
    predictions = model.transform(df)

    print("predictions")
    predictions.show(truncate=False)

    modelsDir = tempfile.mkdtemp(prefix="catboost_models_")

    nativeCatBoostModelPath = os.path.join(modelsDir,
                                           "binclass_model_on_df.cbm")
    model.saveNativeModel(nativeCatBoostModelPath)

    loadedCatBoostModel = catboost_spark.CatBoostClassificationModel.loadNativeModel(
        nativeCatBoostModelPath)
    predictionsLoadedCatBoost = loadedCatBoostModel.transform(df)
    print("predictionsLoadedCatBoost")
    predictionsLoadedCatBoost.show(truncate=False)

    nativeJsonModelPath = os.path.join(modelsDir, "binclass_model_on_df.json")
    model.saveNativeModel(nativeJsonModelPath, catboost_spark.EModelType.Json)

    nativeOnnxModelPath = os.path.join(modelsDir, "binclass_model_on_df.onnx")
    model.saveNativeModel(
        nativeOnnxModelPath, catboost_spark.EModelType.Onnx, {
            "onnx_domain": "ai.catboost",
            "onnx_model_version": 1,
            "onnx_doc_string": "test model for classification",
            "onnx_graph_name": "CatBoostModel_for_classification"
        })

    loadedOnnxModel = catboost_spark.CatBoostClassificationModel.loadNativeModel(
        nativeOnnxModelPath, catboost_spark.EModelType.Onnx)
    predictionsLoadedOnnx = loadedOnnxModel.transform(df)
    print("predictionsLoadedOnnx")
    predictionsLoadedOnnx.show(truncate=False)

    sparkModelPath = os.path.join(modelsDir, "binclass_model_on_df")
    model.write().overwrite().save(sparkModelPath)
    loadedModel = catboost_spark.CatBoostClassificationModel.load(
        sparkModelPath)

    predictionsLoaded = loadedModel.transform(df)
    print("predictionsLoaded")
    predictionsLoaded.show(truncate=False)

    shutil.rmtree(modelsDir)
Example #25
def testModelSerialization():
    spark = test_helpers.getOrCreateSparkSession(test_helpers.getCurrentMethodName())
    import catboost_spark

    featureNames = ["f1", "f2", "f3"]

    srcSchemaData = [
        ("features", VectorUDT()),
        ("label", StringType()),
        ("groupId", LongType()),
        ("subgroupId", IntegerType()),
        ("weight", FloatType())
    ]

    srcData = [
        Row(Vectors.dense(0.13, 0.22, 0.23), "0.34", 0x86F1B93B695F9E6, 0x23D794E, 1.0),
        Row(Vectors.dense(0.1, 0.2, 0.11), "0.12", 0xB337C6FEFE2E2F7, 0xD34BFBD, 0.12),
        Row(Vectors.dense(0.97, 0.82, 0.33), "0.22", 0xB337C6FEFE2E2F7, 0x19CE5B0, 0.18),
        Row(Vectors.dense(0.9, 0.67, 0.17), "0.01", 0xD9DBDD3199D6518, 0x19CE5B0, 1.0),
        Row(Vectors.dense(0.66, 0.1, 0.31), "0.0", 0xD9DBDD3199D6518, 0x1FA606F, 2.0),
        Row(Vectors.dense(0.14, 0.18, 0.1), "0.42", 0xD9DBDD3199D6518, 0x62772D1, 0.45)
    ]
    pool = pool_test_helpers.createRawPool(
        test_helpers.getCurrentMethodName(),
        pool_test_helpers.createSchema(
          srcSchemaData,
          featureNames,
          addFeatureNamesMetadata=True
        ),
        srcData,
        {"groupId": "groupId", "subgroupId": "subgroupId", "weight": "weight"}
    )

    regressor = (catboost_spark.CatBoostRegressor()
      .setIterations(20)
      .setTrainDir(tempfile.mkdtemp(prefix=test_helpers.getCurrentMethodName())))
    model = regressor.fit(pool)
    predictions = model.transform(pool.data)

    print ("predictions")
    predictions.show(truncate=False)

    modelsDir = tempfile.mkdtemp(prefix="catboost_models_")

    nativeCatBoostModelPath = os.path.join(modelsDir, "regressor_model.cbm")
    model.saveNativeModel(nativeCatBoostModelPath)

    loadedCatBoostModel = catboost_spark.CatBoostRegressionModel.loadNativeModel(nativeCatBoostModelPath)
    predictionsLoadedCatBoost = loadedCatBoostModel.transform(pool.data)
    print ("predictionsLoadedCatBoost")
    predictionsLoadedCatBoost.show(truncate=False)

    nativeJsonModelPath = os.path.join(modelsDir, "regressor_model.json")
    model.saveNativeModel(nativeJsonModelPath, catboost_spark.EModelType.Json)

    nativeOnnxModelPath = os.path.join(modelsDir, "regressor_model.onnx")
    model.saveNativeModel(
        nativeOnnxModelPath,
        catboost_spark.EModelType.Onnx,
        {
            "onnx_domain": "ai.catboost",
            "onnx_model_version": 1,
            "onnx_doc_string": "test model for regression",
            "onnx_graph_name": "CatBoostModel_for_regression"
        }
    )

    loadedOnnxModel = catboost_spark.CatBoostRegressionModel.loadNativeModel(nativeOnnxModelPath, catboost_spark.EModelType.Onnx)
    predictionsLoadedOnnx = loadedOnnxModel.transform(pool.data)
    print ("predictionsLoadedOnnx")
    predictionsLoadedOnnx.show(truncate=False)

    sparkModelPath = os.path.join(modelsDir, "regressor_model")
    model.write().overwrite().save(sparkModelPath)
    loadedModel = catboost_spark.CatBoostRegressionModel.load(sparkModelPath)

    predictionsLoaded = loadedModel.transform(pool.data)
    print ("predictionsLoaded")
    predictionsLoaded.show(truncate=False)

    shutil.rmtree(modelsDir)