예제 #1
def get_sdummies(sdf, dummy_columns, keep_top, replace_with='other'):
    """Index and one-hot encode string columns, grouping infrequent levels.

    For every column in ``dummy_columns`` the levels are ranked by row count.
    A level is kept while the cumulative share of rows (most frequent level
    first) is ``<=`` the matching ``keep_top`` entry; every other level is
    overwritten with ``replace_with`` in the column itself. The column is
    then string-indexed and one-hot encoded into ``ONEHOT_<column>``.

    Note that if the most frequent level alone exceeds the threshold, no
    level is kept and the whole column collapses to ``replace_with``.

    :param sdf: A pyspark.sql.dataframe.DataFrame
    :param dummy_columns: String columns that need to be indexed
    :param keep_top: Cumulative-share thresholds, one per entry of
        ``dummy_columns``, e.g. ``[1, 0.8, 0.8]``
    :param replace_with: String to use as replacement for the observations
        that need to be grouped.
    :return: ``sdf`` with grouped string columns and the new
        ``ONEHOT_<column>`` columns; the intermediate ``IDX_<column>``
        columns are dropped.
    :raises IndexError: if ``keep_top`` is shorter than ``dummy_columns``.
    """
    # Local imports: only the cumulative-count window below needs them.
    from pyspark.sql import Window
    from pyspark.sql import functions as F

    total = sdf.count()
    for column_i, string_col in enumerate(dummy_columns):

        # Descending sorting with counts
        sdf_column_count = sdf.groupBy(string_col).count().orderBy(
            'count', ascending=False)

        # Running total of the counts, most frequent level first. The window
        # is ordered explicitly (ties broken by level) so the cumulative sum
        # does not depend on the physical row order.
        running_window = Window.orderBy(
            col('count').desc(), col(string_col).asc()).rowsBetween(
                Window.unboundedPreceding, Window.currentRow)
        sdf_column_count = sdf_column_count.withColumn(
            'cumsum', F.sum('count').over(running_window))

        # Obtain top dummy factors
        sdf_column_top_dummies = sdf_column_count.withColumn(
            "cumperc", sdf_column_count['cumsum'] /
            total).filter(col('cumperc') <= keep_top[column_i])
        keep_list = sdf_column_top_dummies.select(string_col).rdd.flatMap(
            lambda x: x).collect()

        # Replace dropped dummy factors with the grouped factor.
        sdf = sdf.withColumn(
            string_col,
            F.when(col(string_col).isin(keep_list),
                   col(string_col)).otherwise(replace_with))

        # Apply string indexer
        pipeline = Pipeline(stages=[
            StringIndexer(inputCol=string_col, outputCol="IDX_" + string_col)
        ])
        sdf = pipeline.fit(sdf).transform(sdf)

        encoder = OneHotEncoder(inputCol="IDX_" + string_col,
                                outputCol="ONEHOT_" + string_col)
        # Drop the last category so the dummies stay linearly independent.
        encoder.setDropLast(True)
        # Spark 3.x turned OneHotEncoder into an estimator that must be
        # fitted first; the Spark 2.x transformer has no ``fit``.
        if hasattr(encoder, "fit"):
            encoder = encoder.fit(sdf)
        sdf = encoder.transform(sdf)

    # Drop intermediate columns
    drop_columns = ["IDX_" + x for x in dummy_columns]
    sdf = sdf.drop(*drop_columns)

    return sdf
예제 #2
        column_i += 1
    return sdf

        # Apply string indexer
        pipeline = Pipeline(stages=[
            StringIndexer(inputCol=string_col, outputCol="IDX_" + string_col)
        sdf = pipeline.fit(sdf).transform(sdf)

        encoder = OneHotEncoder(inputCol="IDX_" + string_col,
                                outputCol="ONEHOT_" + string_col)
        encoder.setDropLast(True)  # only keep 2^n-n dummies to keep dummy independent.
        sdf = encoder.transform(sdf)

        column_i += 1

    ## Drop intermediate columns
    drop_columns = ["IDX_" +x for x in dummy_columns] # +  dummy_columns
    sdf = sdf.drop(*drop_columns)

    return sdf

schema_sdf = StructType([
        StructField('Year', IntegerType(), True),
        StructField('Month', IntegerType(), True),
        StructField('DayofMonth', IntegerType(), True),
예제 #3
def get_sdummies(sdf, dummy_columns, keep_top, replace_with='zzz_other', dummy_info=None):
    """Index and one-hot encode string columns, grouping infrequent levels.

    For every column in ``dummy_columns`` a list of levels to keep is
    obtained, every other level is overwritten with ``replace_with`` in the
    column itself, and the column is string-indexed and one-hot encoded into
    ``ONEHOT_<column>``.

    When ``dummy_info`` is empty the kept levels are computed from ``sdf``:
    levels are ranked by row count and kept while the cumulative share of
    rows is ``<=`` the matching ``keep_top`` entry. Otherwise the kept levels
    are read from ``dummy_info["factor_selected"]``, which lets a previously
    computed grouping be re-applied to another DataFrame.

    :param sdf: A pyspark.sql.dataframe.DataFrame
    :param dummy_columns: String columns that need to be indexed
    :param keep_top: Cumulative-share thresholds, one per entry of
        ``dummy_columns``, e.g. ``[1, 0.8, 0.8]``; only read when
        ``dummy_info`` is empty.
    :param replace_with: String to use as replacement for the observations
        that need to be grouped.
    :param dummy_info: Result of an earlier call (dict with a
        ``"factor_selected"`` entry), or ``None`` / empty to compute it.
    :return: ``(sdf, dummy_info)``. A freshly computed ``dummy_info`` has the
        keys ``factor_set``, ``factor_selected``, ``factor_dropped`` and
        ``factor_selected_names`` (the last one is currently left empty); a
        supplied ``dummy_info`` is returned unchanged.
    :raises IndexError: if ``keep_top`` is shorter than ``dummy_columns``
        while the levels are being computed.
    :raises KeyError: if a supplied ``dummy_info`` lacks a column.
    """
    # Local imports: only the cumulative-count window below needs them.
    from pyspark.sql import Window
    from pyspark.sql import functions as F

    # Levels are computed only when the caller supplied none; the full row
    # count is a Spark job, so skip it when it is not needed.
    compute_factors = not dummy_info
    total = sdf.count() if compute_factors else None

    factor_set = {}  # The full dummy sets
    factor_selected = {}  # Used dummy sets
    factor_dropped = {}  # Dropped dummy sets
    factor_selected_names = {}  # Final revised factors (not populated yet)

    for column_i, string_col in enumerate(dummy_columns):

        if compute_factors:
            # Descending sorting with counts
            sdf_column_count = sdf.groupBy(string_col).count().orderBy(
                'count', ascending=False)

            # Running total of the counts, most frequent level first. The
            # window is ordered explicitly (ties broken by level) so the
            # cumulative sum does not depend on the physical row order.
            running_window = Window.orderBy(
                col('count').desc(), col(string_col).asc()).rowsBetween(
                    Window.unboundedPreceding, Window.currentRow)
            sdf_column_count = sdf_column_count.withColumn(
                'cumsum', F.sum('count').over(running_window))

            # Obtain top dummy factors
            sdf_column_top_dummies = sdf_column_count.withColumn(
                "cumperc", sdf_column_count['cumsum'] / total).filter(
                    col('cumperc') <= keep_top[column_i])
            keep_list = sdf_column_top_dummies.select(string_col).rdd.flatMap(
                lambda x: x).collect()

            # Save factor sets
            factor_set[string_col] = sdf_column_count.select(string_col).rdd.flatMap(
                lambda x: x).collect()
            factor_selected[string_col] = keep_list
            factor_dropped[string_col] = list(set(factor_set[string_col]) - set(keep_list))
        else:
            # Re-use the levels selected by an earlier call.
            keep_list = dummy_info["factor_selected"][string_col]

        # Replace dropped dummy factors with grouped factors.
        sdf = sdf.withColumn(
            string_col,
            F.when(col(string_col).isin(keep_list),
                   col(string_col)).otherwise(replace_with))

        # Apply string indexer
        pipeline = Pipeline(stages=[
            StringIndexer(inputCol=string_col, outputCol="IDX_" + string_col)
        ])
        sdf = pipeline.fit(sdf).transform(sdf)

        encoder = OneHotEncoder(inputCol="IDX_" + string_col,
                                outputCol="ONEHOT_" + string_col)
        # Drop the last category so the dummies stay linearly independent.
        encoder.setDropLast(True)
        # Spark 3.x turned OneHotEncoder into an estimator that must be
        # fitted first; the Spark 2.x transformer has no ``fit``.
        if hasattr(encoder, "fit"):
            encoder = encoder.fit(sdf)
        sdf = encoder.transform(sdf)

    # Drop intermediate columns
    drop_columns = ["IDX_" + x for x in dummy_columns]
    sdf = sdf.drop(*drop_columns)

    if compute_factors:
        dummy_info = {
            'factor_set': factor_set,
            'factor_selected': factor_selected,
            'factor_dropped': factor_dropped,
            'factor_selected_names': factor_selected_names,
        }

    return sdf, dummy_info
예제 #4
# Transform the two categorical features: index each string column, then
# one-hot encode the resulting index columns.
# NOTE(review): `covertype_df` is not defined in this snippet; presumably a
# DataFrame holding "Soil_Type" and "Wild_Type" columns. Confirm against the
# code that loads it.
from pyspark.ml.feature import StringIndexer

# Fit an indexer on "Soil_Type" and append its output as "Soil_Index".
stringIndexer = StringIndexer(inputCol="Soil_Type", outputCol="Soil_Index")
model1 = stringIndexer.fit(covertype_df)
indexedDF = model1.transform(covertype_df)

# Same for "Wild_Type", chained onto the already indexed DataFrame so both
# index columns end up in `indexedDF2`.
stringIndexer2 = StringIndexer(inputCol="Wild_Type", outputCol="Wild_Index")
model2 = stringIndexer2.fit(indexedDF)
indexedDF2 = model2.transform(indexedDF)

from pyspark.ml.feature import OneHotEncoder

# One-hot encode "Soil_Index" into "SoilEncoder".
# NOTE(review): `transform` is called on the encoder without a prior `fit`,
# which looks like the Spark 2.x transformer API; in Spark 3.x OneHotEncoder
# is an estimator. Confirm the targeted Spark version.
encoder = OneHotEncoder(inputCol="Soil_Index", outputCol="SoilEncoder")
encodedDF = encoder.transform(indexedDF2)

# One-hot encode "Wild_Index" into "WildEncoder"; `encodedDF2` carries both
# encoded columns.
encoder2 = OneHotEncoder(inputCol="Wild_Index", outputCol="WildEncoder")
encodedDF2 = encoder2.transform(encodedDF)

#Use the VectorAssembler technique to accumulate all features into one vector.
from pyspark.ml.feature import VectorAssembler

vector_assembler = VectorAssembler(
        'SoilEncoder',  # feature name of Soil type encoded
        'WildEncoder',  # feature name of Wild type encoded