def _handle_missing(df):
    from pyspark.ml.feature import Imputer
    from pyspark.sql import functions as F

    # Handle missing values, skipping the label/weight/key columns.
    columns = list(
        filter(lambda col: col not in ('class', 'weight', 'crime_pair'),
               df.columns))
    dtypes = dict(df.dtypes)

    # Integer columns: null-fill with the rounded column mean.
    int_columns = list(
        filter(lambda col: dtypes[col] not in ('float', 'double'), columns))
    stats = df.agg(*(F.avg(c).alias(c) for c in int_columns))
    fillers = {
        k: round(v)
        for k, v in stats.first().asDict().items() if v is not None
    }
    df = df.na.fill(fillers)

    # Float columns: use Imputer (mean strategy by default) and replace the
    # originals with their "_imputed" counterparts.
    float_columns = list(
        filter(lambda col: dtypes[col] in ('float', 'double'), columns))
    imputer = Imputer(
        inputCols=float_columns,
        outputCols=["{}_imputed".format(c) for c in float_columns])
    df = imputer.fit(df).transform(df)
    df = df.drop(*float_columns)
    return df
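# --- Usage sketch (added; not part of the original snippet) ---
# A minimal, hedged example of calling _handle_missing. It assumes an active
# SparkSession named `spark`; all column names here are illustrative. The
# integer column 'count' is null-filled in place with its rounded mean, while
# the double column 'score' is replaced by 'score_imputed'.
def _handle_missing_demo(spark):
    df = spark.createDataFrame(
        [(0, 1.0, 7, None, "a-b"), (1, 2.0, None, 3.5, "a-c")],
        ["class", "weight", "count", "score", "crime_pair"])
    return _handle_missing(df)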
def cleanDraftData(position):
    '''
    [X] Fill in nulls for Age with the average or the median of all the ages
        --> opted for the median
    '''
    unCleanData = spark.read.format("csv").option("header", "true").option(
        "inferSchema", "true").load("./data/NflDraftData/draftData.csv")

    # Drop columns we don't need.
    unCleanData = unCleanData.select("Rnd", "Pick", "Player Name", "Pos",
                                     'Age', 'College', 'Draft Year')

    if position in ("RB", "QB", "WR"):
        unCleanData = unCleanData.where(unCleanData["Pos"] == position)
    else:
        # Return all of the offensive skill players (WR, RB, TE, QB, FB).
        # Drop linemen (both offense and defense), defensive players, and
        # special teams.
        droppedPositions = [
            'DE', 'DT', 'T', 'O', 'G', 'C', 'K', 'NT', 'DL', 'OL', 'LS',
            'LB', 'DB', 'P', 'OLB', 'CB', 'S', 'ILB'
        ]
        # With only offensive players we are down to ~2000 data points.
        for dropped in droppedPositions:
            unCleanData = unCleanData.where(unCleanData["Pos"] != dropped)

    # Cast values to doubles (the Imputer requires float/double columns).
    doubleCols = ['Age', 'Rnd', 'Pick', 'Draft Year']
    for c in doubleCols:
        unCleanData = unCleanData.withColumn(c,
                                             unCleanData[c].cast(DoubleType()))

    # Fill in null Age values with the median.
    imputer = Imputer(inputCols=["Age"], outputCols=["Age"])
    cleanData = imputer.setStrategy("median").fit(unCleanData).transform(
        unCleanData)
    return cleanData
def impute_missing(df, columns, out_cols, strategy='mean'):
    """
    Imputes missing data in the specified columns using the mean or median.

    Parameters
    ----------
    columns  : List of columns to be analyzed.
    out_cols : List of output columns with missing values imputed.
    strategy : String that specifies how missing data is computed.
               Can be "mean" or "median".

    return   : Transformed DataFrame with the imputed values.
    """
    # Check that the columns to be processed are in the dataframe.
    assert_cols_in_df(df, columns_provided=columns, columns_df=df.columns)

    assert isinstance(columns, list), "Error: columns argument must be a list"
    assert isinstance(out_cols, list), "Error: out_cols argument must be a list"

    # Check that the strategy argument is a string:
    assert_type_str(df, strategy, "strategy")

    assert (strategy == "mean" or strategy == "median"), \
        "Error: strategy has to be 'mean' or 'median'. 'mean' is default"

    imputer = Imputer(inputCols=columns, outputCols=out_cols)
    model = imputer.setStrategy(strategy).fit(df)
    df = model.transform(df)

    return df
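# --- Usage sketch (added; not from the original source) ---
# A minimal, hedged example of impute_missing. It assumes an active
# SparkSession `spark` and that the assert_* helpers used above are in scope.
# Both columns are imputed with the median; the originals are kept alongside
# the "_imp" output columns.
def impute_missing_demo(spark):
    df = spark.createDataFrame(
        [(1.0, float("nan")), (float("nan"), 3.0), (4.0, 4.0)], ["a", "b"])
    return impute_missing(df, columns=["a", "b"],
                          out_cols=["a_imp", "b_imp"], strategy="median")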
def imputeMonthlyIncome(df):
    imputer = Imputer(inputCols=['MonthlyIncome'],
                      outputCols=['imputed_MonthlyIncome'],
                      strategy='median')

    # Columns are required to be double or float by the Imputer...
    df = df.withColumn(
        'double_MonthlyIncome', df.MonthlyIncome.cast(DoubleType())
    ).drop('MonthlyIncome') \
     .withColumnRenamed('double_MonthlyIncome', 'MonthlyIncome')

    df = imputer.fit(df).transform(df).drop('MonthlyIncome')
    df = df.withColumnRenamed('imputed_MonthlyIncome', 'MonthlyIncome')

    # Addressing MonthlyIncome of 0.
    incomeMedian = np.median(df.select('MonthlyIncome').collect())

    # Apply the income median when MonthlyIncome is 0.
    df = df.withColumn(
        'MonthlyIncome',
        F.when((F.col('MonthlyIncome') == 0),
               incomeMedian).otherwise(F.col('MonthlyIncome')))
    return df
def _imputer_test_single(self):
    data = self.spark.createDataFrame([(1.0, float("nan")),
                                       (2.0, float("nan")),
                                       (float("nan"), 3.0), (4.0, 4.0),
                                       (5.0, 5.0)], ["a", "b"])
    imputer = Imputer(inputCols=["a"], outputCols=["out_a"])
    model = imputer.fit(data)

    # The input name should match the inputCols above.
    model_onnx = convert_sparkml(model, 'Sparkml Imputer',
                                 [('a', FloatTensorType([None, 1]))])
    self.assertTrue(model_onnx is not None)

    # Run the model.
    predicted = model.transform(data)
    expected = predicted.select("out_a").toPandas().values.astype(
        numpy.float32)
    data_np = data.toPandas().a.values.astype(numpy.float32)
    data_np = data_np.reshape((-1, 1))
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlImputerSingle")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['out_a'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def preprocess(df):
    cont_col = ['_c{0}'.format(i) for i in range(0, 14)]
    for i in cont_col:
        df = df.withColumn(i, df[i].cast("float"))
    # Fill nulls in the continuous columns with the column mean.
    imputer = Imputer(inputCols=cont_col,
                      outputCols=cont_col).setStrategy('mean')
    return imputer.fit(df).transform(df)
def imputaciones(VarLimpias):
    # Split columns by type: strings (C) and ints (I).
    C = [i[0] for i in VarLimpias.dtypes if 'string' in i[1]]
    I = [i[0] for i in VarLimpias.dtypes if 'int' in i[1]]
    # The Imputer requires float/double, so cast the int columns first.
    for f in I:
        VarLimpias = VarLimpias.withColumn(f, VarLimpias[f].cast(DoubleType()))
    imputer = Imputer(inputCols=[c for c in VarLimpias.columns if c not in C],
                      outputCols=[c for c in VarLimpias.columns if c not in C])
    Pba = imputer.fit(VarLimpias)
    return Pba.transform(VarLimpias)
def imputation(self):
    # Split columns by type: strings (C) and ints (I).
    C = [i[0] for i in self.data.dtypes if 'string' in i[1]]
    I = [i[0] for i in self.data.dtypes if 'int' in i[1]]
    # The Imputer requires float/double, so cast the int columns first.
    for f in I:
        self.data = self.data.withColumn(f, self.data[f].cast(DoubleType()))
    imputer = Imputer(
        inputCols=[c for c in self.data.columns if c not in C],
        outputCols=[c for c in self.data.columns if c not in C])
    Pba = imputer.fit(self.data)
    return Pba.transform(self.data)
def na_imputer(self, strategy, out_columns="*", na=None, columns="*"):
    """
    Replace missing values with the mean or median, according to the user's
    choice. The user can also customize the definition of a missing value,
    e.g. 999; the default missing values are NaN and null. out_columns
    defaults to the input columns, so the original columns are overridden
    unless output columns are explicitly given.
    """
    # Check input columns.
    if columns == "*":
        columns = self._df.schema.names
    elif isinstance(columns, str):
        columns = [columns]
    else:
        assert isinstance(
            columns,
            list), "Error: columns argument must be a string or a list!"
    if out_columns == "*":
        out_columns = self._df.schema.names
    # Check output columns.
    if isinstance(out_columns, str):
        out_columns = [out_columns]
    else:
        assert isinstance(
            out_columns, list
        ), "Error: output columns argument must be a string or a list!"
    # Check that input and output columns have consistent lengths.
    assert len(columns) == len(
        out_columns
    ), "Error: inconsistent lengths for columns list and output columns list"
    # Check the strategy argument.
    assert (strategy == "mean" or strategy
            == "median"), "Error: strategy can only be 'mean' or 'median'."
    # First convert the input columns to FloatType for the Imputer.
    for col in columns:
        self._df = self._df.withColumn(col, self._df[col].cast(FloatType()))
    # Fit the model.
    imputer = Imputer(inputCols=columns, outputCols=out_columns)
    if na is None:
        model = imputer.setStrategy(strategy).fit(self._df)
    else:
        model = imputer.setStrategy(strategy).setMissingValue(na).fit(
            self._df)
    self._df = model.transform(self._df)
    return self._df
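# --- Usage sketch (added; hypothetical) ---
# na_imputer is written as a method on a wrapper object that holds the
# DataFrame in self._df. Assuming such a wrapper exists, a call that treats
# 999 as the missing-value sentinel might look like:
#
#   out = wrapper.na_imputer("median", na=999.0,
#                            columns=["age"], out_columns=["age"])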
def imputer_continuous_features(df, data_types_map):
    continuous_features = list(
        set(data_types_map['DoubleType']) - set(['DEP_DEL15']))
    continuous_features_imputed = [
        var + "_imputed" for var in continuous_features
    ]
    imputer = Imputer(inputCols=continuous_features,
                      outputCols=continuous_features_imputed)
    tmp = imputer.fit(df).transform(df)
    get_missing_info(tmp)
    return [imputer]
def imputers(dataframe):
    inputCols = []
    outputCols = []
    # Cast the integer features I-1..I-13 to double (the Imputer requires
    # float or double) and impute each column in place with its mean.
    for i in range(1, 14):
        feature = 'I-' + str(i)
        dataframe = dataframe.withColumn(feature,
                                         dataframe[feature].cast(DoubleType()))
        inputCols.append(feature)
        outputCols.append(feature)
    imputer = Imputer(strategy="mean", inputCols=inputCols,
                      outputCols=outputCols)
    return imputer.fit(dataframe).transform(dataframe)
def fill_na_numerical(self, data, columns):
    '''
    Fill null values for numerical data.

    args:
        1. data:    <SPARK DATAFRAME> actual spark dataframe
        2. columns: <LIST> of numerical columns we want to impute
    return:
        <SPARK DATAFRAME> imputed spark dataframe
    '''
    columns = list(columns)
    imputer = Imputer(inputCols=columns,
                      outputCols=['imputed_' + str(col) for col in columns])
    dataCopy = imputer.fit(data).transform(data)
    return dataCopy
def fill_na_numerical(self, data, columns):
    '''
    Purpose: Fill null values for numerical data
    Inputs : data (spark dataframe), columns (numerical columns)
    Output : Imputed spark dataframe
    '''
    columns = list(columns)
    imputer = Imputer(
        inputCols=columns,
        outputCols=['imputed_' + str(col) for col in columns])
    dataCopy = imputer.fit(data).transform(data)
    return dataCopy
def imputer_usecase():
    """
    Computes missing values in a dataset and fills them using the configured
    strategy; the Imputer's `strategy` parameter selects how the fill values
    are computed.
    """
    spark = getSparkSession()
    df = spark.createDataFrame([(1.0, float("nan")), (2.0, float("nan")),
                                (float("nan"), 3.0), (4.0, 4.0), (5.0, 5.0)],
                               ["a", "b"])
    imputer = Imputer(inputCols=["a", "b"], outputCols=["out_a", "out_b"])
    model = imputer.fit(df)
    model.transform(df).show()
def preprocessing(self):
    model = GBTRegressor(labelCol="bicycle_rentals")
    cols = [
        "part_time", "holiday", "week_days", "weather_description_mf",
        "month"
    ]
    imputer = Imputer(inputCols=["humidity", "pressure"],
                      outputCols=["humidity_input", "pressure_input"])
    indexers = [
        StringIndexer(inputCol=col, outputCol="{0}_indexed".format(col))
        for col in cols
    ]
    assembler = VectorAssembler(inputCols=[
        "part_time_indexed", "holiday_indexed", "month_indexed",
        "week_days_indexed", "weather_description_mf_indexed",
        "humidity_input", "pressure_input", "temperature", "wind_speed",
        "from_station_id", "mean_dpcapacity_start", "mean_dpcapacity_end",
        "sum_subscriber", "sum_customer"
    ], outputCol="features")
    pipeline = Pipeline(stages=[imputer] + indexers + [assembler] + [model])
    return pipeline
def preprocess_data(df):
    # Preprocessing the data.

    # Dimension reduction.
    cols_reduce = [
        'Date', 'Time', 'Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3'
    ]
    df = df.drop(*cols_reduce)

    # Fix missing values (the dataset uses '?' to mark missing entries).
    imputer = Imputer(inputCols=df.columns, outputCols=df.columns)
    imputer.setStrategy("mean")
    df = imputer.fit(df).transform(df)

    # Print the column names and datatypes.
    print(df.dtypes)
    return df
def get_ml1_pipeline():
    stages = []

    imputer = Imputer(inputCols=ML1_NUMERICAL_COLUMNS,
                      outputCols=ML1_NUMERICAL_COLUMNS)
    stages.append(imputer)

    ohe_input_cols = []
    ohe_output_cols = []
    for categorical_column in ML1_CATEGORICAL_COLUMNS:
        str_indexer = StringIndexer(inputCol=categorical_column,
                                    outputCol=categorical_column + "_index",
                                    handleInvalid='keep')
        ohe_input_cols.append(str_indexer.getOutputCol())
        ohe_output_cols.append(categorical_column + "_class_vec")
        stages.append(str_indexer)

    encoder = OneHotEncoderEstimator(inputCols=ohe_input_cols,
                                     outputCols=ohe_output_cols,
                                     handleInvalid="error",
                                     dropLast=False)
    stages.append(encoder)

    numerical_vector_assembler = VectorAssembler(
        inputCols=ML1_NUMERICAL_COLUMNS,
        outputCol="numerial_cols_vec",
        handleInvalid="keep")
    scaler = MinMaxScaler(inputCol="numerial_cols_vec",
                          outputCol="scaled_numerical_cols")
    stages.append(numerical_vector_assembler)
    stages.append(scaler)

    label_str_indexer = StringIndexer(inputCol="result",
                                      outputCol="label",
                                      handleInvalid="keep")
    stages.append(label_str_indexer)

    assembler_input = encoder.getOutputCols() + [scaler.getOutputCol()]
    assembler = VectorAssembler(inputCols=assembler_input,
                                outputCol="features",
                                handleInvalid="skip")
    stages.append(assembler)

    pipeline = Pipeline(stages=stages)
    return pipeline
def _fit_crossvalidator(train, features, target):
    """
    Helper function that fits a CrossValidator model to predict a binary
    label `target` on the passed-in training DataFrame using the columns
    in `features`
    :param: train: Spark DataFrame containing training data
    :param: features: List of strings containing column names to use as
            features from `train`
    :param: target: String name of binary target column of `train` to predict
    """
    train = train.select(features + [target])
    model_matrix_stages = [
        Imputer(inputCols=features, outputCols=features),
        VectorAssembler(inputCols=features, outputCol="features"),
        StringIndexer(inputCol="bad_loan", outputCol="label")
    ]
    lr = LogisticRegression(maxIter=10, elasticNetParam=0.5,
                            featuresCol="features")
    pipeline = Pipeline(stages=model_matrix_stages + [lr])
    paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.1, 0.01]).build()
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=BinaryClassificationEvaluator(),
                              numFolds=5)
    with mlflow.start_run():
        mlflow.log_param("data_version", version_to_load)
        mlflow.log_param("data_path", DELTA_TABLE_DEFAULT_PATH)
        cvModel = crossval.fit(train)
    return cvModel.bestModel
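# Note (added): because the Imputer is a stage inside the pipeline that
# CrossValidator re-fits per fold, the imputation statistics are computed
# on each training fold alone, so no fill-value information leaks in from
# the held-out fold.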
def handle_missing(self, non_feature_col=["ID", "TIME_SPAN"]):
    import pyspark
    if type(self) == data_run_experiment:
        raise NotImplementedError(
            "Method needs to be called in a sub-class but is currently "
            "called in the base class")
    try:
        ret_data_frame = self.spark.read.parquet(self.temp_missing_drop)
        self.logger.info(self.temp_missing_drop)
        return ret_data_frame
    except pyspark.sql.utils.AnalysisException as ex:
        template = "An exception of type {0} occurred. Arguments:\n{1!r}"
        message = template.format(type(ex).__name__, ex.args)
        self.logger.info(message)
        self.logger.info("PROCESS")
        # Impute only; aggregation will be done after adding demographics.
        cur_df = self.spark.read.parquet(self.out_file_name)
        cur_cols = cur_df.columns
        categorical_cols = list()
        numerical_cols = list()
        for i in non_feature_col:
            cur_cols.remove(i)
        for i in cur_cols:
            if i.find("C_") == 0:
                categorical_cols.append(i)
            else:
                numerical_cols.append(i)
        cur_df = cur_df.fillna(
            0, subset=categorical_cols).repartition(400).checkpoint()
        self.logger.info(cur_df.count())
        from pyspark.ml.feature import Imputer
        imputedCols = ["imp_{0}".format(x) for x in numerical_cols]
        imputer = Imputer(inputCols=numerical_cols,
                          outputCols=imputedCols).setStrategy("mean")
        imputer_model = imputer.fit(cur_df)
        ret_data_frame = imputer_model.transform(cur_df)
        ret_data_frame.select(non_feature_col + imputedCols +
                              categorical_cols).show()
        ret_data_frame.select(non_feature_col + imputedCols +
                              categorical_cols).write.save(
                                  self.temp_missing_drop)
        ret_data_frame = self.spark.read.parquet(self.temp_missing_drop)
        return ret_data_frame
def _fit_crossvalidator(train, features, target, version):
    """
    Helper function that fits a CrossValidator model to predict a binary
    label `target` on the passed-in training DataFrame using the columns
    in `features`
    :param: train: Spark DataFrame containing training data
    :param: features: List of strings containing column names to use as
            features from `train`
    :param: target: String name of binary target column of `train` to predict
    """
    train = train.select(features + [target])
    model_matrix_stages = [
        Imputer(inputCols=features, outputCols=features),
        VectorAssembler(inputCols=features, outputCol="features"),
        StringIndexer(inputCol="bad_loan", outputCol="label")
    ]
    lr = LogisticRegression(maxIter=10, elasticNetParam=0.5,
                            featuresCol="features")
    pipeline = Pipeline(stages=model_matrix_stages + [lr])
    paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.1, 0.01]).build()
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=BinaryClassificationEvaluator(),
                              numFolds=5)

    import matplotlib.pyplot as plt
    from mlflow import spark as mlflow_spark

    mlflow.start_run()
    cvModel = crossval.fit(train)
    best_model = cvModel.bestModel

    # Plot the ROC curve: FPR on the x-axis, TPR on the y-axis.
    roc = best_model.stages[len(best_model.stages) - 1].summary.roc.toPandas()
    fig1 = plt.figure()
    fig1.clf()
    plt.clf()
    plt.plot(roc['FPR'], roc['TPR'])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    fig1.savefig("roc.png")
    mlflow.log_artifact("roc.png")
    fig1.clf()
    plt.clf()

    lr_summary = best_model.stages[len(best_model.stages) - 1].summary
    mlflow.log_metric("accuracy", lr_summary.accuracy)
    mlflow.log_metric("weightedFalsePositiveRate",
                      lr_summary.weightedFalsePositiveRate)
    mlflow.log_metric("weightedFMeasure", lr_summary.weightedFMeasure())
    mlflow.log_metric("weightedPrecision", lr_summary.weightedPrecision)
    mlflow.log_metric("weightedRecall", lr_summary.weightedRecall)
    mlflow_spark.log_model(best_model, "loan-classifier-mllib")
    mlflow.end_run()
    return best_model
def imputer_mean(df):
    weather_numeric_with_nulls = [
        'origin_WND_speed_rate', 'origin_CIG_ceiling_height',
        'origin_VIS_distance', 'origin_TMP_air_temperature',
        'origin_DEW_dew_point_temp', 'dest_WND_speed_rate',
        'dest_CIG_ceiling_height', 'dest_VIS_distance',
        'dest_TMP_air_temperature', 'dest_DEW_dew_point_temp',
        'origin_aa1_rain_depth', 'dest_aa1_rain_depth',
        'origin_aj1_snow_depth', 'dest_aj1_snow_depth'
    ]
    # Fit the imputer on the training split only, then apply to the full df.
    imputer = Imputer(inputCols=weather_numeric_with_nulls,
                      outputCols=weather_numeric_with_nulls)
    model = imputer.fit(filter_to_train(df))
    df = model.transform(df)
    return df
def impute(input_cols, output_cols, strategy="mean"):
    """
    Imputes missing data in the specified columns using the mean or median.

    :param input_cols: List of columns to be analyzed.
    :param output_cols: List of output columns with missing values imputed.
    :param strategy: String that specifies how missing data is computed.
           Can be "mean" or "median"
    :return: Dataframe object (DF with columns that have the imputed values).
    """
    input_cols = parse_columns(self, input_cols)
    output_cols = val_to_list(output_cols)

    imputer = Imputer(inputCols=input_cols, outputCols=output_cols)

    df = self
    model = imputer.setStrategy(strategy).fit(df)
    df = model.transform(df)

    return df
def preprocessing(self, trainDF, validDF, testDF):
    """
    Data preprocessing steps involving the following transformations:
        1. One-hot encoding of categorical variables
        2. Imputation of missing values in numerical variables
        3. Standardization of numerical variables

    Parameters
    -----------
    trainDF: training data set
    validDF: validation data set
    testDF: test data set

    Returns
    -----------
    Transformed training, validation, and test data sets with the
    assembler vector
    """
    # Extract numerical and categorical column names.
    cat_cols = [field for (field, dataType) in trainDF.dtypes
                if dataType == "string"]
    num_cols = [field for (field, dataType) in trainDF.dtypes
                if ((dataType == "double") & (field != self.label_col))]

    # Create output columns.
    index_output_cols = [x + "Index" for x in cat_cols]
    ohe_output_cols = [x + "OHE" for x in cat_cols]

    # String indexer for categorical variables.
    s_indexer = StringIndexer(inputCols=cat_cols,
                              outputCols=index_output_cols,
                              handleInvalid="skip")

    # One-hot encode categorical columns.
    cat_encoder = OneHotEncoder(inputCols=index_output_cols,
                                outputCols=ohe_output_cols)

    # Impute missing values in numerical columns.
    num_imputer = Imputer(inputCols=num_cols, outputCols=num_cols)

    # Vector assembler.
    assembler_inputs = ohe_output_cols + num_cols
    assembler = VectorAssembler(inputCols=assembler_inputs,
                                outputCol="unscaled_features")

    # Feature scaling using StandardScaler.
    scaler = StandardScaler(inputCol=assembler.getOutputCol(),
                            outputCol="features")

    # Create the pipeline.
    stages = [s_indexer, cat_encoder, num_imputer, assembler, scaler]
    pipeline = Pipeline(stages=stages)
    pipelineModel = pipeline.fit(trainDF)

    # Preprocess the training, validation, and test data.
    trainDF_scaled = pipelineModel.transform(trainDF)
    validDF_scaled = pipelineModel.transform(validDF)
    testDF_scaled = pipelineModel.transform(testDF)

    return assembler, trainDF_scaled, validDF_scaled, testDF_scaled
def missing_val_imput(self):
    check = self.input.select(*(sum(col(c).isNull().cast("int")).alias(c)
                                for c in self.input.columns))
    check.show()
    print("||| Above table shows missing values across columns |||")
    check_pd = self.input.toPandas()
    val = check_pd.isnull().any().any()
    if val == True:
        imputer = Imputer(
            inputCols=self.input.columns,
            outputCols=["{}".format(c) for c in self.input.columns])
        cleaned_input = imputer.fit(self.input).transform(self.input)
        print("Missing values replaced with mean across columns")
        print("Returning cleaned data")
        return cleaned_input
    else:
        print("No missing value found")
        return self.input
def replace_missings(self, test=False):
    """ Replace missing values with a default value """
    for col in list(self.config_dict.keys()):
        # Check if the replace-missings transformation needs to be applied.
        if self.config_dict[col]["replace_missings"]["apply"]:
            imputer = Imputer(
                inputCols=[col],
                outputCols=["{}_replace_missings".format(col)]
            ).setMissingValue(
                self.config_dict[col]["replace_missings"]["value"])
            if test:
                self.test_data = imputer.fit(self.test_data).transform(
                    self.test_data)
            else:
                self.train_data = imputer.fit(self.train_data).transform(
                    self.train_data)
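# Illustrative only (added): replace_missings expects a per-column
# config_dict. A hypothetical shape that would drive it is sketched below;
# the real project's schema may differ. Note that "value" is passed to
# setMissingValue, i.e. it is the sentinel the Imputer treats as missing and
# replaces with the (default mean) statistic.
#
#   config_dict = {
#       "Age":  {"replace_missings": {"apply": True,  "value": 999.0}},
#       "Fare": {"replace_missings": {"apply": False, "value": None}},
#   }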
def imputeNumeric(numeric_DF):
    '''
    Takes a spark df with continuous numeric columns and outputs a spark df
    where all null values are replaced with the column average. The first
    column, which holds the outcome values, is preserved.
    '''
    outputColumns = ["{}".format(c) for c in numeric_DF.columns[1:11]]
    catColumns = ["{}".format(c) for c in numeric_DF.columns[11:]]
    imputer = Imputer(
        inputCols=numeric_DF.columns[1:11],
        outputCols=["{}".format(c) for c in numeric_DF.columns[1:11]])
    model = imputer.fit(numeric_DF)
    imputedDF = model.transform(numeric_DF).select(['_1'] + outputColumns +
                                                   catColumns)
    return imputedDF
def impute(self):
    from pyspark.ml.feature import Imputer, ImputerModel
    df = self.session.createDataFrame([(1.0, float("nan")),
                                       (2.0, float("nan")),
                                       (float("nan"), 3.0), (4.0, 4.0),
                                       (5.0, 5.0)], ["a", "b"])

    # By default the mean is used for filling.
    imputer = Imputer(inputCols=["a", "b"], outputCols=["out_a", "out_b"])
    model = imputer.fit(df)
    model.transform(df).show()

    # We can also use the median, and define which value counts as missing;
    # null is always treated as missing.
    imputer = Imputer(inputCols=["a", "b"],
                      outputCols=["out_a", "out_b"],
                      strategy="median",
                      missingValue=float("nan"))
    model = imputer.fit(df)
    model.transform(df).show()

    # The fit step is generally a learning process, and we can persist its
    # result. Unfortunately, the parameters cannot be changed afterwards.
    model.write().overwrite().save("/tmp/wow")
    model = ImputerModel.read().load("/tmp/wow")
    model.transform(df).show()
def main(self, sc, *args):
    """
    For each input file, i.e. the train and test sets produced by Initiate,
    apply the same set of transformations
    """
    sqlContext = SQLContext(sc)

    # For each key in the output dictionary of the Initiate task,
    # i.e. train and test.
    for inputFile in Initiate(self.input_file, self.output_path).output():
        df = sqlContext.read.csv(
            Initiate(self.input_file,
                     self.output_path).output()[inputFile].path,
            sep=",",
            header=True,
            inferSchema=True)

        # Select the final list of features.
        list_features = ["Age", "Sex_indexed", "Fare", "Survived"]
        df = df.select(*list_features)

        # Replace missing values.
        cols_missing = ["Age"]
        for col in cols_missing:
            imputer = Imputer(
                inputCols=[col],
                outputCols=["{}_replace_missings".format(col)]
            ).setMissingValue(26.0)
            df = imputer.fit(df).transform(df)

        # Discretize.
        cols_disc = {
            "Age_replace_missings":
                [-math.inf, 0.83, 21.0, 26.0, 33.0, 71.0, math.inf],
            "Fare": [-math.inf, 7.225, 8.122, 26.0, 83.475, math.inf],
        }
        for col in cols_disc:
            bucketizer = Bucketizer(splits=cols_disc[col],
                                    inputCol=col,
                                    outputCol="{}_discretized".format(col))
            df = bucketizer.transform(df)

        df.write.csv(self.output()[inputFile].path, header=True)
def impute(columns, strategy="mean"):
    """
    Imputes missing data in the specified columns using the mean or median.

    :param columns: List of columns to be analyzed.
    :param strategy: String that specifies how missing data is computed.
           Can be "mean" or "median"
    :return: Dataframe object (DF with columns that have the imputed values).
    """
    columns = parse_columns(self, columns,
                            filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)

    df = self

    output_cols = []
    for col_name in columns:
        # The Imputer requires not merely numeric but float or double columns.
        df = df.cols.cast(col_name, "float")
        output_cols.append(col_name + IMPUTE_SUFFIX)

    imputer = Imputer(inputCols=columns, outputCols=output_cols)

    model = imputer.setStrategy(strategy).fit(df)
    df = model.transform(df)

    return df
def imputer(features_name, strategy="mean", missing_value=None,
            footer="_imputer"):
    """
    Spark experiment method

    Args:
        features_name: list of input column names to impute.
        strategy: imputation strategy, "mean" or "median".
        missing_value: optional placeholder value to treat as missing
            (null/NaN are handled by default).
        footer: suffix appended to each input name to form the output
            column names.

    Returns:
        An unfitted Imputer configured with the given columns and strategy.
    """
    output_names = [name + footer for name in features_name]
    imputer = Imputer() \
        .setInputCols(features_name) \
        .setOutputCols(output_names) \
        .setStrategy(strategy)
    if missing_value:
        imputer.setMissingValue(missing_value)
    return imputer
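# --- Usage sketch (added) ---
# A hedged example: imputer() above returns an *unfitted* Imputer stage, so
# it drops straight into a Pipeline. Column names here are illustrative, and
# an active SparkSession `spark` is assumed.
def imputer_pipeline_demo(spark):
    from pyspark.ml import Pipeline
    df = spark.createDataFrame(
        [(1.0, float("nan")), (float("nan"), 3.0), (4.0, 4.0)],
        ["age", "income"])
    stage = imputer(["age", "income"], strategy="median")
    pipeline = Pipeline(stages=[stage])
    # Adds 'age_imputer' and 'income_imputer' output columns.
    return pipeline.fit(df).transform(df)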
Run with:
  bin/spark-submit examples/src/main/python/ml/imputer_example.py
"""
# $example on$
from pyspark.ml.feature import Imputer
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("ImputerExample")\
        .getOrCreate()

    # $example on$
    df = spark.createDataFrame([
        (1.0, float("nan")),
        (2.0, float("nan")),
        (float("nan"), 3.0),
        (4.0, 4.0),
        (5.0, 5.0)
    ], ["a", "b"])

    imputer = Imputer(inputCols=["a", "b"], outputCols=["out_a", "out_b"])
    model = imputer.fit(df)

    model.transform(df).show()
    # $example off$

    spark.stop()
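# For reference (added): with the default mean strategy, the surrogate for
# `a` is mean(1, 2, 4, 5) = 3.0 and for `b` is mean(3, 4, 5) = 4.0, so
# model.transform(df).show() prints:
#
# +---+---+-----+-----+
# |  a|  b|out_a|out_b|
# +---+---+-----+-----+
# |1.0|NaN|  1.0|  4.0|
# |2.0|NaN|  2.0|  4.0|
# |NaN|3.0|  3.0|  3.0|
# |4.0|4.0|  4.0|  4.0|
# |5.0|5.0|  5.0|  5.0|
# +---+---+-----+-----+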