def impute(self):
        from pyspark.ml.feature import Imputer, ImputerModel

        df = self.session.createDataFrame([(1.0, float("nan")),
                                           (2.0, float("nan")),
                                           (float("nan"), 3.0), (4.0, 4.0),
                                           (5.0, 5.0)], ["a", "b"])

        # By default, missing values are filled with the column mean
        imputer = Imputer(inputCols=["a", "b"], outputCols=["out_a", "out_b"])
        model = imputer.fit(df)
        model.transform(df).show()

        # We can also use the median and specify which value marks a missing entry;
        # null values are always treated as missing.
        imputer = Imputer(inputCols=["a", "b"],
                          outputCols=["out_a", "out_b"],
                          strategy="median",
                          missingValue=float("nan"))
        model = imputer.fit(df)
        model.transform(df).show()

        ## The fit step is a learning step, and the learned model can be persisted.
        ## Unfortunately, the parameters of a loaded model cannot currently be changed.
        model.write().overwrite().save("/tmp/wow")
        model = ImputerModel.read().load("/tmp/wow")
        model.transform(df).show()
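        # Note: the loaded model keeps the surrogate values learned during fit();
        # they can be inspected (but not changed) via model.surrogateDF.show().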
Example #2
def _handle_missing(df):
    from pyspark.ml.feature import Imputer
    from pyspark.sql import functions as F

    # handle missing values
    columns = list(
        filter(lambda col: col not in ('class', 'weight', 'crime_pair'),
               df.columns))
    dtypes = dict(df.dtypes)

    # for int columns
    int_columns = list(
        filter(lambda col: dtypes[col] not in ('float', 'double'), columns))
    stats = df.agg(*(F.avg(c).alias(c) for c in int_columns))
    fillers = {
        k: round(v)
        for k, v in stats.first().asDict().items() if v is not None
    }
    df = df.na.fill(fillers)
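    # df.na.fill accepts a {column: value} dict, so every integer column is
    # filled with its own rounded mean computed above.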

    # for float columns
    float_columns = list(
        filter(lambda col: dtypes[col] in ('float', 'double'), columns))
    print(float_columns)
    imputer = Imputer(
        inputCols=float_columns,
        outputCols=["{}_imputed".format(c) for c in float_columns])
    df = imputer.fit(df).transform(df)
    df = df.drop(*float_columns)

    return df
Example #3
def imputeMonthlyIncome(df):
    imputer = Imputer(inputCols=['MonthlyIncome'],
                      outputCols=['imputed_MonthlyIncome'],
                      strategy='median')

    # Columns are required to be either double or float by the Imputer...
    df = df.withColumn(
        'double_MonthlyIncome',
        df.MonthlyIncome.cast(DoubleType())
    ).drop('MonthlyIncome') \
     .withColumnRenamed('double_MonthlyIncome', 'MonthlyIncome')

    df = imputer.fit(df).transform(df).drop('MonthlyIncome')

    df = df.withColumnRenamed('imputed_MonthlyIncome', 'MonthlyIncome')

    # Addressing MonthlyIncome of 0
    incomeMedian = np.median(df.select('MonthlyIncome').collect())
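    # Note: collect() pulls the whole column to the driver. For large data an
    # approximate median can be computed in a distributed way, e.g.
    # df.approxQuantile('MonthlyIncome', [0.5], 0.001)[0].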

    # Apply income median if the MonthlyIncome is 0
    df = df.withColumn(
        'MonthlyIncome',
        F.when((F.col('MonthlyIncome') == 0),
               incomeMedian).otherwise(F.col('MonthlyIncome')))

    return df
Example #4
    def _imputer_test_single(self):
        data = self.spark.createDataFrame([(1.0, float("nan")),
                                           (2.0, float("nan")),
                                           (float("nan"), 3.0), (4.0, 4.0),
                                           (5.0, 5.0)], ["a", "b"])
        imputer = Imputer(inputCols=["a"], outputCols=["out_a"])
        model = imputer.fit(data)

        # the input name should match the inputCols above
        model_onnx = convert_sparkml(model, 'Sparkml Imputer',
                                     [('a', FloatTensorType([None, 1]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.select("out_a").toPandas().values.astype(
            numpy.float32)
        data_np = data.toPandas().a.values.astype(numpy.float32)
        data_np = data_np.reshape((-1, 1))
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlImputerSingle")
        onnx_model_path = paths[-1]
        output, output_shapes = run_onnx_model(['out_a'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #5
def preprocess(df):
    cont_col = ['_c{0}'.format(i) for i in range(0, 14)]
    for i in cont_col:
        df = df.withColumn(i, df[i].cast("float"))
    # Continuous columns fill null with mean
    imputer = Imputer(inputCols=cont_col, outputCols=cont_col).setStrategy('mean')

    return imputer.fit(df).transform(df)
Example #6
def imputaciones(VarLimpias):
    C = [i[0] for i in VarLimpias.dtypes if 'string' in i[1]]
    I = [i[0] for i in VarLimpias.dtypes if 'int' in i[1]]

    for f in I:
        VarLimpias = VarLimpias.withColumn(f, VarLimpias[f].cast(DoubleType()))
    imputer = Imputer(inputCols=[c for c in VarLimpias.columns if c not in C],
                      outputCols=[c for c in VarLimpias.columns if c not in C])
    Pba = imputer.fit(VarLimpias)
    return Pba.transform(VarLimpias)
Example #7
    def imputation(self):
        C=[i[0] for i in self.data.dtypes if 'string' in i[1]]
        I=[i[0] for i in self.data.dtypes if 'int' in i[1]]

        for f in I: self.data = self.data.withColumn(f, self.data[f].cast(DoubleType()))
        imputer = Imputer(
            inputCols=[c for c in self.data.columns if c not in C],
            outputCols=[c for c in self.data.columns if c not in C])
        Pba=imputer.fit(self.data)
        return Pba.transform(self.data)
def imputer_continuous_features(df, data_types_map):
    continuous_features = list(
        set(data_types_map['DoubleType']) - set(['DEP_DEL15']))
    continuous_features_imputed = [
        var + "_imputed" for var in continuous_features
    ]
    imputer = Imputer(inputCols=continuous_features,
                      outputCols=continuous_features_imputed)
    tmp = imputer.fit(df).transform(df)
    get_missing_info(tmp)
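    # The transform above is only used for the missing-value report; the
    # (unfitted) Imputer stage itself is returned, presumably for reuse in a Pipeline.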
    return [imputer]
Example #9
    def replace_missings(self, test=False):
        """
        Replace missing values with a default value
        """

        for col in list(self.config_dict.keys()):
            # check if the replace missing transformation needs to be applied
            if self.config_dict[col]["replace_missings"]["apply"]:
                imputer = Imputer(
                    inputCols=[col],
                    outputCols=[
                        "{}_replace_missings".format(col)
                    ]).setMissingValue(
                        self.config_dict[col]["replace_missings"]["value"])
                if test:
                    self.test_data = imputer.fit(self.test_data).transform(
                        self.test_data)
                else:
                    self.train_data = imputer.fit(self.train_data).transform(
                        self.train_data)
def imputers(dataframe):
    inputCols = []
    outputCols = []
    for i in range(1,14):
        feature = 'I-'+str(i)
        dataframe =  dataframe.withColumn(feature, dataframe[feature].cast(DoubleType())) 
        inputCols.append(feature)
        outputCols.append(feature)
    imputer = Imputer(strategy="mean",
        inputCols=inputCols,
        outputCols=outputCols)
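    # Because outputCols is the same list as inputCols, the imputed values
    # replace the original I-1..I-13 columns rather than adding new ones.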
    return imputer.fit(dataframe).transform(dataframe)
Example #11
    def fill_na_numerical(self,data,columns):
        '''
        FILL NULL VALUES FOR NUMERICAL DATA
        args:
        1.data: <SPARK DATAFRAME> actual spark dataframe
        2.columns: <LIST> of numerical columns we want to Impute

        return: <SPARK DATAFRAME>Imputed spark dataframe
        '''
        columns=list(columns)
        imputer=Imputer(inputCols=columns,outputCols=['imputed_'+str(col) for col in columns])
        dataCopy=imputer.fit(data).transform(data)
        return dataCopy    
    def fill_na_numerical(self, data, columns):
        '''
        Purpose: Fill null values for numerical data
        Inputs : Data(spark dataframe), column(numerical columns)
        Output : Imputed spark dataframe

        '''
        columns = list(columns)
        imputer = Imputer(
            inputCols=columns,
            outputCols=['imputed_' + str(col) for col in columns])
        dataCopy = imputer.fit(data).transform(data)
        return dataCopy
Example #13
def imputer_usecase():
    """
        Computes the missing values in the dataset and fills them in,
        using the strategy specified by the `strategy` parameter.
    """
    spark = getSparkSession()
    df = spark.createDataFrame([(1.0, float("nan")), (2.0, float("nan")),
                                (float("nan"), 3.0), (4.0, 4.0), (5.0, 5.0)],
                               ["a", "b"])

    imputer = Imputer(inputCols=["a", "b"], outputCols=["out_a", "out_b"])
    model = imputer.fit(df)

    model.transform(df).show()
Example #14
File: app.py Project: mledl/BDMA_HW
def prepocess_data(df):
    # Preprocessing the data
    # Dimension reduction
    cols_reduce = [
        'Date', 'Time', 'Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3'
    ]
    df = df.drop(*cols_reduce)

    # Fixing missing values (dataset uses ? as NaN for missing values)
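    # (This assumes the '?' placeholders were already converted to null/NaN and
    # the columns cast to a numeric type upstream, since the Imputer only works
    # on float/double columns.)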
    imputer = Imputer(inputCols=df.columns, outputCols=df.columns)
    imputer.setStrategy("mean")
    df = imputer.fit(df).transform(df)

    # Print the column name and datatype
    print(df.dtypes)
    return df
    def handle_missing(self, non_feature_col=["ID", "TIME_SPAN"]):
        import pyspark
        if type(self) == data_run_experiment:
            raise NotImplementedError(
                "Method need to be called in sub-class but currently called in base class"
            )

        try:
            ret_data_frame = self.spark.read.parquet(self.temp_missing_drop)
            self.logger.info(self.temp_missing_drop)
            return ret_data_frame
        except pyspark.sql.utils.AnalysisException as ex:
            template = "An exception of type {0} occurred. Arguments:\n{1!r}"
            message = template.format(type(ex).__name__, ex.args)
            self.logger.info(message)
            self.logger.info("PROCESS")

            #impute only. aggregation will be done after adding demographics
            cur_df = self.spark.read.parquet(self.out_file_name)
            cur_cols = cur_df.columns
            categorical_cols = list()
            numerical_cols = list()
            for i in non_feature_col:
                cur_cols.remove(i)
            for i in cur_cols:
                if i.find("C_") == 0:
                    categorical_cols.append(i)
                else:
                    numerical_cols.append(i)

            cur_df = cur_df.fillna(
                0, subset=categorical_cols).repartition(400).checkpoint()
            self.logger.info(cur_df.count())

            from pyspark.ml.feature import Imputer
            imputedCols = ["imp_{0}".format(x) for x in numerical_cols]
            imputer = Imputer(inputCols=numerical_cols,
                              outputCols=imputedCols).setStrategy("mean")
            imputer_model = imputer.fit(cur_df)
            ret_data_frame = imputer_model.transform(cur_df)
            ret_data_frame.select(non_feature_col + imputedCols +
                                  categorical_cols).show()
            ret_data_frame.select(non_feature_col + imputedCols +
                                  categorical_cols).write.save(
                                      self.temp_missing_drop)
            ret_data_frame = self.spark.read.parquet(self.temp_missing_drop)
            return ret_data_frame
def imputer_mean(df):

    weather_numeric_with_nulls = [
        'origin_WND_speed_rate', 'origin_CIG_ceiling_height',
        'origin_VIS_distance', 'origin_TMP_air_temperature',
        'origin_DEW_dew_point_temp', 'dest_WND_speed_rate',
        'dest_CIG_ceiling_height', 'dest_VIS_distance',
        'dest_TMP_air_temperature', 'dest_DEW_dew_point_temp',
        'origin_aa1_rain_depth', 'dest_aa1_rain_depth',
        'origin_aj1_snow_depth', 'dest_aj1_snow_depth'
    ]

    imputer = Imputer(inputCols=weather_numeric_with_nulls,
                      outputCols=weather_numeric_with_nulls)
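    # Fit on the training split only (filter_to_train) so that test-set
    # statistics do not leak into the imputed values.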
    model = imputer.fit(filter_to_train(df))
    df = model.transform(df)

    return df
def imputeNumeric(numeric_DF):
    '''
    takes a spark df with continuous numeric columns
    outputs a spark df where all null values are replaced with the column average

    the first column, which holds the outcome values, is preserved
    '''
    outputColumns=["{}".format(c) for c in numeric_DF.columns[1:11]]
    catColumns = ["{}".format(c) for c in numeric_DF.columns[11:]]

    imputer = Imputer(
        inputCols=numeric_DF.columns[1:11],
        outputCols=["{}".format(c) for c in numeric_DF.columns[1:11]]
    )

    model = imputer.fit(numeric_DF)

    imputedDF = model.transform(numeric_DF).select(['_1']+outputColumns+catColumns)

    return imputedDF
Example #18
    def missing_val_imput(self):
        check = self.input.select(*(sum(col(c).isNull().cast("int")).alias(c)
                                    for c in self.input.columns))
        check.show()
        print("||| Above table shows missing values accross columns |||")
        check_pd = self.input.toPandas()
        val = check_pd.isnull().any().any()

        if val == True:
            imputer = Imputer(
                inputCols=self.input.columns,
                outputCols=["{}".format(c) for c in self.input.columns])
            cleaned_input = imputer.fit(self.input).transform(self.input)
            print("Missing values replaced with mean accross columns")
            print("Returning cleaned data")
            return cleaned_input

        else:
            print("No missing value found")
            return self.input
Example #19
    def main(self, sc, *args):
        """ For each input files, i.e. train and test 'initiated, apply the same set of transformatons
        """

        sqlContext = SQLContext(sc)
        # For each key in the output dictionary of the Initiate task, i.e. train and test
        for inputFile in Initiate(self.input_file, self.output_path).output():
            df = sqlContext.read.csv(Initiate(
                self.input_file, self.output_path).output()[inputFile].path,
                                     sep=",",
                                     header=True,
                                     inferSchema=True)

            # Select final list of features
            list_features = ["Age", "Sex_indexed", "Fare", "Survived"]
            df = df.select(*list_features)

            # Replace missing values
            cols_missing = ["Age"]
            for col in cols_missing:
                imputer = Imputer(inputCols=[col],
                                  outputCols=[
                                      "{}_replace_missings".format(col)
                                  ]).setMissingValue(26.0)
                df = imputer.fit(df).transform(df)

            # Discretize
            cols_disc = {
                "Age_replace_missings":
                [-math.inf, 0.83, 21.0, 26.0, 33.0, 71.0, math.inf],
                "Fare": [-math.inf, 7.225, 8.122, 26.0, 83.475, math.inf],
            }
            for col in cols_disc:
                bucketizer = Bucketizer(splits=cols_disc[col],
                                        inputCol=col,
                                        outputCol="{}_discretized".format(col))
                df = bucketizer.transform(df)

            df.write.csv(self.output()[inputFile].path, header=True)
Example #20
def fillMean(df):
    '''
    Fill null cells in every column using a Spark Imputer
    (note: despite the function name, strategy='median' is used below).
    Parameter:
        - dataframe
    Returns:
        - dataframe with nulls imputed
    '''
    cols = df.columns
    imputer = Imputer(
        strategy='median',
        inputCols=cols,
        # outputCols=['{}_clean'.format(c) for c in cols]
        outputCols=cols)

    df = imputer.fit(df).transform(df)
    # newCols = ['{}_clean'.format(c) for c in cols]
    # df = df[newCols]
    # df = df.rename(
    #     columns={
    #         name: name.split('_')[0] for name in newCols
    #     },
    # )
    return df
                                  outputCol="result")

result = discretizer.fit(df).transform(df)
result.show()

# COMMAND ----------

### Imputer fills the empty values in the dataframe with either the mean or the median
from pyspark.ml.feature import Imputer

df = spark.createDataFrame([(1.0, float("nan")), (2.0, float("nan")),
                            (float("nan"), 3.0), (4.0, 4.0), (5.0, 5.0)],
                           ["a", "b"])

imputer = Imputer(inputCols=["a", "b"], outputCols=["out_a", "out_b"])
model = imputer.fit(df)

model.transform(df).show()

# COMMAND ----------

### VectorSlicer extracts a subset of the original features from the feature vector
from pyspark.ml.feature import VectorSlicer
from pyspark.ml.linalg import Vectors
from pyspark.sql.types import Row

df = spark.createDataFrame([
    Row(userFeatures=Vectors.sparse(3, {
        0: -2.0,
        1: 2.3
    })),
Example #22
def preprocess_data(initial_df):
    # Drop the columns that are not allowed
    banned_columns = ["ArrTime", "ActualElapsedTime", "AirTime", "TaxiIn", "Diverted", "CarrierDelay", "WeatherDelay", "NASDelay", "SecurityDelay", "LateAircraftDelay"]
    processed_df = initial_df.drop(*banned_columns)

    # Drop variables Cancelled and CancellationCode. Also drop cancelled rows
    processed_df = processed_df.withColumn("Cancelled", processed_df["Cancelled"].cast('boolean'))
    processed_df = processed_df.filter(processed_df.Cancelled == False)
    processed_df = processed_df.drop("CancellationCode", "Cancelled")

    # Drop variables TaxiOut, FlightNum, TailNum (categorical --> more info in plane-data.csv)
    processed_df = processed_df.drop("TaxiOut", "FlightNum", "TailNum")

    # Drop rows with NA in DepTime; the rest will be removed when cast to int later
    processed_df = processed_df.filter(processed_df["DepTime"] != "NA")

    # Fill "NA" values of the distance varible
    #   Split the dataframe so we can get a df with only NA values for variable distance
    df1 = processed_df.filter(processed_df.Distance == "NA").drop("Distance")    # nulls
    df2 = processed_df.filter(processed_df.Distance != "NA")    # not nulls

    #   Make a df with all the combinations of Orig, Dest, and distance
    location_distance_df = df2.select(df2.Origin, df2.Dest, df2.Distance).distinct()

    #   Join the missing rows with the df of (Origin, Dest, Distance) combinations
    df1 = df1.join(location_distance_df, [df1.Origin == location_distance_df.Origin, df1.Dest == location_distance_df.Dest], how="left").drop(location_distance_df.Origin).drop(location_distance_df.Dest)

    #   Union the non-"NA" part of the dataframe with the joined one
    processed_df = df1.union(df2)

    #   Mark these intermediate dataframes as no longer needed
    df1.unpersist()
    df2.unpersist()
    location_distance_df.unpersist()

    #   Fill the rest with the average
    #   (the Imputer requires a float/double input column, so cast to double here;
    #    Distance is cast back to int further below)
    processed_df = processed_df.withColumn("Distance", processed_df["Distance"].cast('double'))
    imputer = Imputer(strategy='mean', inputCol='Distance', outputCol='Distance')
    processed_df = imputer.fit(processed_df).transform(processed_df)

    # Clean the date and time variables and cast them to timestamps
    formatter = udf(hourFormatter)

    time_cols = ["DepTime", "CRSDepTime", "CRSArrTime"]
    for c in time_cols:
        processed_df = processed_df.withColumn(c, formatter(processed_df[c]))

    # Cast columns types to int
    for c in ['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime', 'CRSArrTime', 'CRSElapsedTime', 'ArrDelay', 'DepDelay', 'Distance']:
        processed_df = processed_df.withColumn(c, processed_df[c].cast('int'))

    # Drop rows with CRSElapsedTime negative --> does not make sense to have a flight with negative duration
    processed_df = processed_df.filter(processed_df["CRSElapsedTime"] >= 0)
    #processed_df = processed_df.filter(processed_df["Distance"] > 0)

    # Drop all the rows containing null values (if there are any left)
    for c in ["CRSElapsedTime", "ArrDelay", "DepDelay", "Origin", "Dest"]:
        processed_df = processed_df.filter(processed_df[c].isNotNull())

    # Drop categorical variables UniqueCarrier (carriers.csv), Origin (airports.csv) and Dest (airports.csv) 
    processed_df = processed_df.drop("Origin", "Dest")

    processed_df = processed_df.select("Year","Month","DayofMonth","DayOfWeek","DepTime","CRSDepTime","CRSArrTime","CRSElapsedTime","ArrDelay","DepDelay","Distance","UniqueCarrier")

    return processed_df
Example #23
    print("percentage of missing values in ", x, " is ", proportion, "%")
    if proportion < 30:
        contFilt.append(x)
    else:
        contDrop.append(x)

print("We have no nan or ?; only null")



from pyspark.ml.feature import Imputer

contImp =  ['Var1_imp','Var2_imp', 'Var3_imp','Var4_imp','Var5_imp', 'Var6_imp','Var7_imp','Var8_imp', 'NVVar1_imp','NVVar2_imp', 'NVVar3_imp', 'NVVar4_imp' ]
imputer = Imputer(strategy = 'mean', inputCols = cont, outputCols=contImp)

insurance_imp = imputer.fit(insuranceRaw).transform(insuranceRaw)

print("Impution COMPLETE \n\n\n")


for c in cont:
    insurance_imp = insurance_imp.drop(c)




insurance_imp.printSchema()

insurance_imp.show(20)

Example #24
Run with:
  bin/spark-submit examples/src/main/python/ml/imputer_example.py
"""
# $example on$
from pyspark.ml.feature import Imputer
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("ImputerExample")\
        .getOrCreate()

    # $example on$
    df = spark.createDataFrame([
        (1.0, float("nan")),
        (2.0, float("nan")),
        (float("nan"), 3.0),
        (4.0, 4.0),
        (5.0, 5.0)
    ], ["a", "b"])

    imputer = Imputer(inputCols=["a", "b"], outputCols=["out_a", "out_b"])
    model = imputer.fit(df)

    model.transform(df).show()
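    # With the default "mean" strategy the learned surrogates are
    # mean(a) = (1+2+4+5)/4 = 3.0 and mean(b) = (3+4+5)/3 = 4.0, so the NaN
    # cells are filled with 3.0 in out_a and 4.0 in out_b.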
    # $example off$

    spark.stop()
Example #25
                         sep="\t",
                         header=True,
                         inferSchema=True)
train, test = df.randomSplit([0.7, 0.3], seed=12345)

mapping = sqlContext.createDataFrame([(0, "male"), (1, "female")],
                                     ["id", "category"])

indexer = StringIndexer(inputCol="Sex", outputCol="SexIndex")
train = indexer.fit(train).transform(train)
train.show()

percentiles = train.approxQuantile("Fare", [0.01, 0.99], 0.01)

# Cap Fare at the 99th percentile and floor it at the 1st percentile
winsorize = expr("""IF(Fare >= {}, {}, IF(Fare <= {}, {}, Fare))""".format(
    percentiles[1], percentiles[1], percentiles[0], percentiles[0]))

train = train.withColumn("Fare", winsorize)
train.show()

imputer = Imputer(inputCols=["Age", "Fare"],
                  outputCols=["out_Age", "out_Fare"]).setStrategy("median")
train = imputer.fit(train).transform(train)
train.show()
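# (In a complete workflow the same fitted indexer, imputer and discretizer
#  would normally also be applied to `test`, so both splits use statistics
#  learned from the training data.)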

discretizer = QuantileDiscretizer(numBuckets=4,
                                  inputCol="out_Age",
                                  outputCol="out_Age_disc")
train = discretizer.fit(train).transform(train)
train.show()
Example #26
                               outputCol="categoryIndexTicket")
model1 = stringIndexer1.fit(newdf)
stringIndexer2 = StringIndexer(inputCol="Cabintag",
                               outputCol="categoryIndexCabin")
model2 = stringIndexer2.fit(newdf)
stringIndexer3 = StringIndexer(inputCol="Nametag",
                               outputCol="categoryIndexName")
model3 = stringIndexer3.fit(newdf)
indexed = model0.transform(newdf)
indexed = model1.transform(indexed)
indexed = model2.transform(indexed)
newdf = model3.transform(indexed)

newdf = newdf.withColumn("Age", newdf["Age"].cast(DoubleType()))
imputer = Imputer(inputCols=["Age"], outputCols=["out_Age"], strategy='median')
newdf = imputer.fit(newdf).transform(newdf)
QuantileDiscreete1 = QuantileDiscretizer(numBuckets=5,
                                         inputCol="out_Age",
                                         outputCol="agebucket")
newdf = QuantileDiscreete1.fit(newdf).transform(newdf)

stringIndexer4 = StringIndexer(inputCol="out_Age", outputCol="categoryAge")
model4 = stringIndexer4.fit(newdf)
newdf = model4.transform(newdf)

newdf = newdf.withColumn("Fare", newdf["Fare"].cast(DoubleType()))
imputer2 = Imputer(inputCols=["Fare"],
                   outputCols=["out_Fare"],
                   strategy='median')
newdf = imputer2.fit(newdf).transform(newdf)
QuantileDiscreete2 = QuantileDiscretizer(numBuckets=4,
Example #27
def impute(clickDF  , numericColumnNames=[]):
    outputColumNames = [columnName+"_out_imputer" for columnName in numericColumnNames] 
    #imputer =  Imputer().setInputCols(numericColumnNames).setOutputCols(outputColumNames)  
    imputer =  Imputer().setInputCols(numericColumnNames).setOutputCols(outputColumNames).setMissingValue(0)
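    # setMissingValue(0) makes the Imputer treat 0 (rather than the default NaN)
    # as the missing-value marker.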
    return imputer.fit(clickDF).transform(clickDF), outputColumNames
    passengers = readPassengersWithCastingToDoubles(spark).select(
        "survived", "pclass", "sibsp", "parch")

    # Step - 1: Define strategy and new column names for Imputer transformation
    imputer = Imputer(
        strategy="mean",
        inputCols=["pclass", "sibsp", "parch"],
        outputCols=["pclass_imputed", "sibsp_imputed", "parch_imputed"])

    # Step - 2: Make vectors from the dataframe's columns using a VectorAssembler
    assembler = VectorAssembler(
        inputCols=["pclass_imputed", "sibsp_imputed", "parch_imputed"],
        outputCol="features")

    # Step - 3: Transform the dataset with the Imputer
    passengersWithFilledEmptyValues = imputer.fit(passengers).transform(
        passengers)
    passengersWithFilledEmptyValues.show()  # look at the first rows

    #  Step - 4: Transform dataframe to vectorized dataframe
    output = assembler.transform(passengersWithFilledEmptyValues).select(
        "features", "survived"
    )  # <============== drop row if it has nulls/NaNs in the next list of columns)
    output.show()

    # Step - 5: Set up the Decision Tree Classifier
    trainer = DecisionTreeClassifier(labelCol="survived",
                                     featuresCol="features")

    # Step - 6: Train the model
    model = trainer.fit(output)
numeric_cols_impute = [c for c in columns_less_40 if c[0] == 'i']
cate_cols_impute = [c for c in columns_less_40 if c[0] != 'i']

# categorical columns: add class missing?
for c in cate_cols_impute:
    criteoDF = criteoDF.withColumn(
        c,
        when(col(c).isNull(), 'missing').otherwise(criteoDF[c]))

# impute numerical columns with mean
numeric_imputer = Imputer(
    inputCols=numeric_cols_impute,
    outputCols=["{}_imputed".format(c) for c in numeric_cols_impute],
    strategy='mean')

criteoDF = numeric_imputer.fit(criteoDF).transform(criteoDF)
criteoDF = criteoDF.drop(*numeric_cols_impute)

categorical_cols = [
    'c_' + str(i + 1) for i in range(26)
    if ('c_' + str(i + 1) not in columns_70) and ('c_' +
                                                  str(i + 1) not in columns_40)
]
balanced_count = float(criteoDF.count())

one_hot_cols = []
max_distinct = 0
for k in categorical_cols:
    # now, let's print out the distinct value percentage
    count = criteoDF.select(k).distinct().count()
    if count <= 20:
Example #30
    OneHotEncoder(inputCol=c + "_string_encoded", outputCol=c + "_one_hot")
    for c in strings_used
]

ppl = Pipeline(stages=stage_string + stage_one_hot)
df = ppl.fit(df).transform(df)

# NUMERIC
numericals = data_types["DoubleType"]
numericals = [var for var in numericals]
numericals_imputed = [var + "_imputed" for var in numericals]

from pyspark.ml.feature import Imputer

imputer = Imputer(inputCols=numericals, outputCols=numericals_imputed)
df = imputer.fit(df).transform(df)

# cast integer to double and impute
for c in data_types["IntegerType"]:
    df = df.withColumn(c + "_cast_to_double", df[c].cast("double"))

cast_vars = [var for var in df.columns if var.endswith("_cast_to_double")]
cast_vars_imputed = [var + "imputed" for var in cast_vars]

imputer_for_cast_vars = Imputer(inputCols=cast_vars,
                                outputCols=cast_vars_imputed)
df = imputer_for_cast_vars.fit(df).transform(df)

# vector assembly
from pyspark.ml.feature import VectorAssembler
Example #31
data = data.select(
    ['Survived', 'Pclass', 'Gender', 'Age', 'SibSp', 'Parch', 'Fare'])
'''
Handling Missing Age Values
Age is an important feature; it is not wise to drop it because of some missing values. What we can do is
fill the missing values with the help of the existing ones. This process is called data imputation. There are
many available strategies, but we will follow a simple one that fills missing values with the mean value
calculated from the sample. Spark ML makes the job easy using the Imputer class: first we define the
estimator, fit it to the data to obtain a model, then we apply that model (the transformer) to the data.
'''

from pyspark.ml.feature import Imputer
imputer = Imputer(strategy='mean',
                  inputCols=['Age'],
                  outputCols=['AgeImputed'])
imputer_model = imputer.fit(data)
data = imputer_model.transform(data)

from pyspark.ml.feature import StringIndexer
gender_indexer = StringIndexer(inputCol='Gender', outputCol='GenderIndexed')
gender_indexer_model = gender_indexer.fit(data)
data = gender_indexer_model.transform(data)
'''
Creating the Features Vector
We learned previously that Spark ML expects data to be represented in two columns: a features vector
and a label column. We have the label column ready (Survived), so let us prepare the features vector.
'''

from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=[
    'Pclass', 'SibSp', 'Parch', 'Fare', 'AgeImputed', 'GenderIndexed'