def test_aft_regression_survival(self):
    data = self.spark.createDataFrame(
        [(1.0, Vectors.dense(1.0), 1.0),
         (1e-40, Vectors.sparse(1, [], []), 0.0)],
        ["label", "features", "censor"])
    aft = AFTSurvivalRegression()
    model = aft.fit(data)
    feature_count = data.first()[1].size
    model_onnx = convert_sparkml(
        model, 'Sparkml AFTSurvivalRegression',
        [('features', FloatTensorType([1, feature_count]))],
        spark_session=self.spark)
    self.assertTrue(model_onnx is not None)
    # Run the original Spark model and keep its predictions as the reference
    predicted = model.transform(data)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32),
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlAFTSurvivalRegression")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['prediction'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
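# For reference, a minimal sketch of scoring the saved ONNX model directly with
# onnxruntime instead of the run_onnx_model test helper above. Assumes
# onnxruntime is installed; the input name 'features' matches the tensor
# declared in convert_sparkml, and one row is fed to match the declared
# [1, feature_count] shape.
import onnxruntime as rt

sess = rt.InferenceSession(onnx_model_path)
pred = sess.run(['prediction'], {'features': data_np[:1]})  # single-row batch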
def survival_regression(trainingDataFrame, quantileProbabilities=[0.3, 0.6],
                        quantilesCol="quantiles"):
    aft = AFTSurvivalRegression(quantileProbabilities=quantileProbabilities,
                                quantilesCol=quantilesCol)
    aftModel = aft.fit(trainingDataFrame)
    result = {}
    result["model"] = aftModel
    result["intercept"] = aftModel.intercept
    result["coefficients"] = aftModel.coefficients
    result["scale"] = aftModel.scale
    return result
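# A minimal usage sketch for survival_regression above. The toy DataFrame and
# the `spark` session variable are assumptions, not part of the original
# snippet; the rows mirror the example data used elsewhere in this section.
from pyspark.ml.linalg import Vectors

training = spark.createDataFrame(
    [(1.218, 1.0, Vectors.dense(1.560, -0.605)),
     (2.949, 0.0, Vectors.dense(0.346, 2.158))],
    ["label", "censor", "features"])
result = survival_regression(training)
print(result["coefficients"], result["intercept"], result["scale"])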
def aftsurvivalRegression(df, conf):
    """
    AFT Survival Regression training.
    Input:  - training DataFrame (df)
            - tuning and hyperparameter configuration (conf)
    Output: - AFT survival regression model (model)
    """
    feature_col = conf["params"].get("featuresCol", "features")
    label_col = conf["params"].get("labelCol", "label")
    pred_col = conf["params"].get("predictionCol", "prediction")
    cens_col = conf["params"].get("censorCol", "censor")
    fit_intercept = conf["params"].get("fitIntercept", True)
    max_iter = conf["params"].get("maxIter", 100)
    tol = conf["params"].get("tol", 1e-6)
    quant_p = conf["params"].get("quantileProbabilities",
                                 [0.01, 0.05, 0.1, 0.25, 0.5,
                                  0.75, 0.9, 0.95, 0.99])
    quant_col = conf["params"].get("quantilesCol", None)
    agg_depth = conf["params"].get("aggregationDepth", 2)

    afts = AFTSurvivalRegression(featuresCol=feature_col, labelCol=label_col,
                                 predictionCol=pred_col, censorCol=cens_col,
                                 maxIter=max_iter, fitIntercept=fit_intercept,
                                 tol=tol, aggregationDepth=agg_depth)
    # Only request quantile outputs when a quantiles column was configured
    if quant_col is not None:
        afts.setQuantileProbabilities(quant_p).setQuantilesCol(quant_col)

    if conf["tuning"]:
        if conf["tuning"].get("method").lower() == "crossval":
            folds = conf["tuning"].get("methodParam", 2)
            # Build the grid of hyperparameters to search, e.g. maxIter and
            # aggregationDepth; keys are parameter names resolved via getParam.
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                pg.addGrid(afts.getParam(key), paramGrids[key])
            grid = pg.build()
            evaluator = RegressionEvaluator(labelCol=label_col,
                                            predictionCol=pred_col)
            cv = CrossValidator(estimator=afts, estimatorParamMaps=grid,
                                evaluator=evaluator, numFolds=folds)
            model = cv.fit(df)
        elif conf["tuning"].get("method").lower() == "trainvalsplit":
            tr = conf["tuning"].get("methodParam", 0.8)
            # Same grid construction as above, evaluated on a single
            # train/validation split instead of cross-validation.
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                pg.addGrid(afts.getParam(key), paramGrids[key])
            grid = pg.build()
            evaluator = RegressionEvaluator(labelCol=label_col,
                                            predictionCol=pred_col)
            tvs = TrainValidationSplit(estimator=afts, estimatorParamMaps=grid,
                                       evaluator=evaluator, trainRatio=tr)
            model = tvs.fit(df)
    else:
        model = afts.fit(df)
    return model
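# A hypothetical `conf` dict matching the keys aftsurvivalRegression reads;
# the grid values are illustrative only. Pass "tuning": None to fit without
# any hyperparameter search.
conf = {
    "params": {
        "featuresCol": "features",
        "labelCol": "label",
        "censorCol": "censor",
        "maxIter": 100,
        "tol": 1e-6,
    },
    "tuning": {
        "method": "crossval",
        "methodParam": 3,  # number of folds for CrossValidator
        "paramGrids": {
            "maxIter": [50, 100],        # keyed by Param name,
            "aggregationDepth": [2, 4],  # resolved via afts.getParam(key)
        },
    },
}
model = aftsurvivalRegression(df, conf)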
""" if __name__ == "__main__": spark = SparkSession \ .builder \ .appName("PythonAFTSurvivalRegressionExample") \ .getOrCreate() # $example on$ training = spark.createDataFrame( [(1.218, 1.0, Vectors.dense(1.560, -0.605)), (2.949, 0.0, Vectors.dense(0.346, 2.158)), (3.627, 0.0, Vectors.dense(1.380, 0.231)), (0.273, 1.0, Vectors.dense(0.520, 1.151)), (4.199, 0.0, Vectors.dense(0.795, -0.226))], ["label", "censor", "features"]) quantileProbabilities = [0.3, 0.6] aft = AFTSurvivalRegression(quantileProbabilities=quantileProbabilities, quantilesCol="quantiles") model = aft.fit(training) # Print the coefficients, intercept and scale parameter for AFT survival regression print("Coefficients: " + str(model.coefficients)) print("Intercept: " + str(model.intercept)) print("Scale: " + str(model.scale)) model.transform(training).show(truncate=False) # $example off$ spark.stop()
def train_model(training):
    quantileProbabilities = [0.3, 0.6]
    aft = AFTSurvivalRegression(quantileProbabilities=quantileProbabilities,
                                quantilesCol="quantiles")
    model = aft.fit(training)
    return model
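# A short sketch (toy data assumed) of calling train_model and inspecting the
# "quantiles" column configured above; `spark` is an existing SparkSession.
training = spark.createDataFrame(
    [(1.218, 1.0, Vectors.dense(1.560, -0.605)),
     (0.273, 1.0, Vectors.dense(0.520, 1.151))],
    ["label", "censor", "features"])
model = train_model(training)
model.transform(training).select("prediction", "quantiles").show(truncate=False)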
# Build the model
from pyspark.sql.functions import lit

# Add constant censor and label columns required by AFTSurvivalRegression
dataset = dataset.withColumn("censor", lit(1))
dataset = dataset.withColumn("label", lit(1))

quantileProbabilities = [0.3, 0.6]
aft = AFTSurvivalRegression(quantileProbabilities=quantileProbabilities,
                            quantilesCol="quantiles")
model = aft.fit(dataset)
# print("Coefficients: " + str(model.coefficients))
# print("Intercept: " + str(model.intercept))
# print("Scale: " + str(model.scale))

# Evaluate the model on training data
# valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
# MSE = valuesAndPreds \
#     .map(lambda vp: (vp[0] - vp[1]) ** 2) \
#     .reduce(lambda x, y: x + y) / valuesAndPreds.count()
# print("Mean Squared Error = " + str(MSE))

# Save and load model
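# A hedged sketch of the save/load step the trailing comment above alludes to;
# the path is a placeholder, and AFTSurvivalRegressionModel is PySpark's
# fitted-model class for AFT regression.
from pyspark.ml.regression import AFTSurvivalRegressionModel

model.write().overwrite().save("/tmp/aft_survival_model")  # placeholder path
sameModel = AFTSurvivalRegressionModel.load("/tmp/aft_survival_model")
sameModel.transform(dataset).select("prediction", "quantiles").show(5)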