def train(self, df):
    """Fit the feature scaler and the isolation forest on ``df``.

    Steps, in order:

    1. ``self.build_features_vectors(df)`` prepares the input.
    2. A ``StandardScaler`` reading ``self.features_values_column`` and
       writing ``self.features_values_scaled`` is fitted; the fitted model
       is stored in ``self.scaler_model``.
    3. The scaled data is persisted with the storage level resolved from
       ``self.storage_level``.
    4. If ``self.categorical_features`` is non-empty, ``_create_indexes``
       and ``_add_categorical_features`` are invoked on the scaled data.
    5. An ``IForest`` estimator is fitted on the scaled features column;
       the fitted model is stored in ``self.iforest_model``.
    6. The persisted data is released, whether or not fitting succeeded.

    :param df: training dataset, handed to ``self.build_features_vectors``.
    :return: None. Results are stored on ``self.scaler_model`` and
        ``self.iforest_model``.
    """
    df = self.build_features_vectors(df)

    scaler = StandardScaler()
    scaler.setInputCol(self.features_values_column)
    scaler.setOutputCol(self.features_values_scaled)
    scaler.setWithMean(self.scaler_with_mean)
    scaler.setWithStd(self.scaler_with_std)
    self.scaler_model = scaler.fit(df)

    # Persist the scaled data because it is reused by the steps below.
    df = self.scaler_model.transform(df).persist(
        StorageLevelFactory.get_storage_level(self.storage_level)
    )
    try:
        if len(self.categorical_features):
            self._create_indexes(df)
            # NOTE(review): the return value is discarded here. If
            # _add_categorical_features returns a new dataframe instead of
            # mutating state, its result never reaches the forest below --
            # confirm against the helper's implementation.
            self._add_categorical_features(df, self.features_values_scaled)

        iforest = IForest(
            featuresCol=self.features_values_scaled,
            predictionCol=self.prediction_column,
            # disabled: anomalyScore=self.score_column,
            numTrees=self.num_trees,
            maxSamples=self.max_samples,
            maxFeatures=self.max_features,
            maxDepth=self.max_depth,
            contamination=self.contamination,
            bootstrap=self.bootstrap,
            approxQuantileRelativeError=(
                self.approximate_quantile_relative_error
            ),
            # disabled: numCategoricalFeatures=len(self.categorical_features)
        )
        iforest.setSeed(self.seed)

        # NOTE(review): the param map uses a plain string key; verify that
        # IForest.fit honours it rather than silently ignoring it.
        params = {'threshold': self.threshold}
        self.iforest_model = iforest.fit(df, params)
    finally:
        # Release the cached data even when index creation or fitting
        # raises, so a failed training run does not leave it persisted.
        df.unpersist()