Example #1
def oneHotEncodeColumns(df, cols):
    from pyspark.ml.feature import OneHotEncoder
    newdf = df
    for c in cols:
        onehotenc = OneHotEncoder(inputCol=c, outputCol=c+"-onehot", dropLast=False)
        newdf = onehotenc.transform(newdf).drop(c)
        newdf = newdf.withColumnRenamed(c+"-onehot", c)
    return newdf
def events(df, column_name):
    from pyspark.ml.feature import StringIndexer, OneHotEncoder
    i = column_name+"I"
    v = column_name+"V"
    stringIndexer = StringIndexer(inputCol=column_name, outputCol=i)
    model = stringIndexer.fit(df)
    indexed = model.transform(df)
    encoder = OneHotEncoder(inputCol=i, outputCol=v)
    encoded = encoder.transform(indexed)
    return encoded
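# A minimal usage sketch for the two helpers above (hedged: assumes Spark 2.x,
# where OneHotEncoder is a plain Transformer, an existing `spark` session, and an
# illustrative "color" column).
usage_df = spark.createDataFrame([("red",), ("blue",), ("red",)], ["color"])
encoded = events(usage_df, "color")             # adds "colorI" (index) and "colorV" (one-hot vector)
hot = oneHotEncodeColumns(encoded, ["colorI"])  # replaces the numeric index column with its one-hot vector
hot.show()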
Example #3
    def oneHotEncoding(self, df, input_col):
        stringInd = StringIndexer(inputCol=input_col, outputCol="indexed")
        model = stringInd.fit(df)
        td = model.transform(df)
        encoder = OneHotEncoder(inputCol="indexed", outputCol="features", dropLast=False)
        final_encoding = encoder.transform(td).select(df.id, 'features').cache()
        
        conv_udf = udf(lambda line: Vectors.dense(line).tolist())
        final_encoding = final_encoding.select(df.id,conv_udf(final_encoding.features).alias("num_"+input_col)).cache()

        return final_encoding
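# Hedged usage sketch for the method above, assuming a hypothetical enclosing class
# (called `FeaturePrep` here) and that StringIndexer/OneHotEncoder (pyspark.ml.feature),
# udf (pyspark.sql.functions) and Vectors (pyspark.ml.linalg) are imported. Note that the
# udf declares no return type, so the dense list comes back as a string (Spark's default
# StringType).
sample_df = spark.createDataFrame([(1, "a"), (2, "b"), (3, "a")], ["id", "category"])
encoded_out = FeaturePrep().oneHotEncoding(sample_df, "category")
encoded_out.show(truncate=False)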
Example #4
        .getOrCreate()

#load data in csv format with header
rawData = spark.read.load("./hour.csv",format="csv",header=True)
rawData.count()#17379
data=rawData
#casual+registered=cnt
rawData=rawData.drop("casual","registered")#drop columns
rawData=rawData.withColumnRenamed("cnt","label")#rename columns
cat_features=rawData.columns[2:10]

for col in cat_features:
    #must give a new column name
    indexer = StringIndexer(inputCol=col, outputCol=col+"_indexed",handleInvalid='error')
    indexed = indexer.fit(rawData).transform(rawData)
    encoder = OneHotEncoder(inputCol=col+"_indexed", outputCol=col+"Vec")
    rawData = encoder.transform(indexed)


#cast columns to float
for col in rawData.columns[2:15]:
    rawData=rawData.withColumn(col,rawData[col].cast(FloatType()))

#convert date to date format and extract week day
from pyspark.sql.functions import date_format
rawData=rawData.withColumn("dteday",rawData["dteday"].cast(DateType()))
rawData=rawData.withColumn('dteday', date_format('dteday', 'u'))

isweekend=udf(lambda x:1.0 if int(x) > 5 else 0.0,FloatType())
rawData=rawData.withColumn("isWeekend",isweekend("dteday"))#whether it is weekend
rawData=rawData.drop("dteday")
Example #5
c16I = StringIndexer(inputCol="C16", outputCol="iC16", handleInvalid="skip")
c18I = StringIndexer(inputCol="C18", outputCol="iC18", handleInvalid="skip")
c19I = StringIndexer(inputCol="C19", outputCol="iC19", handleInvalid="skip")
c21I = StringIndexer(inputCol="C21", outputCol="iC21", handleInvalid="skip")
appcatI = StringIndexer(inputCol="app_category",
                        outputCol="i_app_category",
                        handleInvalid="skip")
devtypeI = StringIndexer(inputCol="device_type",
                         outputCol="i_device_type",
                         handleInvalid="skip")
sitecatI = StringIndexer(inputCol="site_category",
                         outputCol="i_site_category",
                         handleInvalid="skip")

#OneHotEncoder applied after the stringIndexer to form binary vector for each column
c1E = OneHotEncoder(inputCol="iC1", outputCol="C1Vector")
c15E = OneHotEncoder(inputCol="iC15", outputCol="C15Vector")
c16E = OneHotEncoder(inputCol="iC16", outputCol="C16Vector")
c18E = OneHotEncoder(inputCol="iC18", outputCol="C18Vector")
c19E = OneHotEncoder(inputCol="iC19", outputCol="C19Vector")
c21E = OneHotEncoder(inputCol="iC21", outputCol="C21Vector")
appcatE = OneHotEncoder(inputCol="i_app_category",
                        outputCol="i_app_category_Vector")
devtypeE = OneHotEncoder(inputCol="i_device_type",
                         outputCol="i_device_type_Vector")
sitecatE = OneHotEncoder(inputCol="i_site_category",
                         outputCol="i_site_category_Vector")

#Vector assembler
fAssembler = VectorAssembler(inputCols=[
    "C1Vector", "C15Vector", "C16Vector", "C18Vector", "C19Vector",
Example #6
cols = [
    'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'
]

data_cols = data.select(cols)
data_cols.show()
final_data = data_cols.na.drop()

# Transform the categorical columns into numbers
gender_indexer = StringIndexer(inputCol='Sex', outputCol='SexIndex')

# A B C
# 0 1 2
# One hot encode ----> this maps everything into [1, 0, 0], [0, 1, 0], etc.
gender_encoder = OneHotEncoder(
    inputCol='SexIndex', outputCol='SexVec'
)  # ---> each entry will be converted to a vector A = [1, 0] B = [0, 1]

embark_indexer = StringIndexer(inputCol='Embarked', outputCol='EmbarkedIndex')
embark_encoder = OneHotEncoder(
    inputCol='EmbarkedIndex', outputCol='EmbarkedVec'
)  # ---> each entry will be converted to a vector A = [1, 0] B = [0, 1]

new_cols = ['Pclass', 'SexVec', 'Age', 'SibSp', 'Parch', 'Fare', 'EmbarkedVec']
assembler = VectorAssembler(inputCols=new_cols, outputCol='features')

logreg_titanic = LogisticRegression(featuresCol='features',
                                    labelCol='Survived')

pipeline = Pipeline(stages=[
    gender_indexer, embark_indexer, gender_encoder, embark_encoder, assembler,
Example #7
data = data.filter(lambda row: row != header)
schema = data.map(lambda x: Row(id=x[0], make=x[1], vdps=x[2], label=x[3]))
df = sqlContext.createDataFrame(schema)

# string indexer for our categorical features
# this indexes each categorical feature and we will
# save them in a data frame that maps the make name to the string
# for persistence purposes
indexer = StringIndexer(inputCol="make", outputCol="makeIDX")
df = indexer.fit(df).transform(df)
make_idx_mappings = df.select('make', 'makeIDX').distinct()
make_idx_mappings.show()

# one hot encoder
# this will convert the indexed strings to sparse one hot vectors
# think of this as dummy feature creation
encoder = OneHotEncoder(inputCol="makeIDX", outputCol="make_sparse_vect")
df = encoder.transform(df)

# spark models expect to see a feature vector and a prediction column
# so we need to put all our features into a vector, in this case
# the sparse vector and vdp count, we also have to do some
# data type transformations from string to double
df = df.withColumn("vdp_int", df["vdps"].cast("double"))
df = df.withColumn("label_int", df["label"].cast("double"))
assembler = VectorAssembler(inputCols=["make_sparse_vect", "vdp_int"],
                            outputCol='features')
df = assembler.transform(df)

# make the model
# the step size and iterations is touchy so results might be funky
gbt = GBTRegressor(maxIter=100,
Example #8
mmi_value_0_node4 = ["Sex", "Embarked", "Survived"]
mmi_value_1_node4 = ["indexedSex", "indexedEmbarked", "indexedSurvived"]
stages_node4 = []
for i in range(len(mmi_value_0_node4)):
    stages_node4.append(
        StringIndexer(inputCol=mmi_value_0_node4[i],
                      outputCol=mmi_value_1_node4[i],
                      handleInvalid="error",
                      stringOrderType="frequencyDesc"))

mmi_value_0_node5 = ["indexedSex", "indexedEmbarked"]
mmi_value_1_node5 = ['sexVec', 'embarkedVec']
stages_node5 = []
for i in range(len(mmi_value_0_node5)):
    stages_node5.append(
        OneHotEncoder(inputCol=mmi_value_0_node5[i],
                      outputCol=mmi_value_1_node5[i]))

pipeline_stage_node6 = VectorAssembler(
    outputCol="features",
    inputCols=["Pclass", "sexVec", "Age", "SibSp", "Fare", "embarkedVec"])
pipeline_stage_node7 = RandomForestClassifier(featureSubsetStrategy="auto",
                                              numTrees=20,
                                              maxDepth=5,
                                              predictionCol="prediction",
                                              rawPredictionCol="rawPrediction",
                                              probabilityCol="probability",
                                              labelCol="indexedSurvived",
                                              featuresCol="features",
                                              impurity="gini")

stages_node8 = [
Example #9


# create a new "carrier_indexed" column
(indexed_df.select(["origin", "dest", "carrier", "carrier_indexed"]).sample(fraction=0.001, withReplacement=False, seed=rnd_seed).show())



# check the carrier labels learned by the StringIndexer
carrierIndexer.labels


# check the carrier code and index mapping
indexed_df.select(["carrier", "carrier_indexed"]).distinct().show()

carrierEncoder = OneHotEncoder(inputCol="{0}_indexed".format(colName), outputCol="{0}_encoded".format(colName))
encoded_df = carrierEncoder.transform(indexed_df)



(encoded_df.select(["origin", "dest", "carrier", "carrier_indexed", "carrier_encoded"]).sample(fraction=0.001, withReplacement=False, seed=rnd_seed).show())


carrierEncoder = OneHotEncoder(inputCol="{0}_indexed".format(colName), outputCol="{0}_encoded".format(colName), dropLast=False)
encoded_df = carrierEncoder.transform(indexed_df)

(encoded_df.select(["carrier", "carrier_indexed", "carrier_encoded", "dist"]).sample(fraction=0.001, withReplacement=False, seed=rnd_seed).show())

#Combine StringIndexer, OneHotEncoder, VectorAssembler and a Transformer to put features into a feature vector column

        for col in ordinals
    ],
    VectorAssembler(inputCols=ordinals_input, outputCol='ordinals_vector'),
    StandardScaler(inputCol='ordinals_vector',
                   outputCol='ordinals_std',
                   withStd=True,
                   withMean=True),

    # categoricals
    *[
        StringIndexer(
            inputCol=col, outputCol=col + "_index", handleInvalid='keep')
        for col in categoricals
    ],
    *[
        OneHotEncoder(
            inputCol=col + "_index", outputCol=col + "_encode", dropLast=True)
        for col in categoricals
    ],
    VectorAssembler(inputCols=categoricals_input,
                    outputCol='categoricals_vector'),
    StandardScaler(inputCol='categoricals_vector',
                   outputCol='categoricals_std',
                   withStd=True,
                   withMean=True),

    # final assembler
    VectorAssembler(inputCols=stdFeatures, outputCol='features_std'),

    #PCA
    PCA(k=75, inputCol='features_std', outputCol='features_final')
]
Example #11
total = train.union(val).union(test)

# create features
indexer = StringIndexer(inputCol="_c12", outputCol="c22")
indexer = indexer.fit(total)
train = indexer.transform(train)
val = indexer.transform(val)
test = indexer.transform(test)
# create label
indexer = StringIndexer(inputCol="_c11", outputCol="label")
indexer = indexer.fit(total)
train = indexer.transform(train)
val = indexer.transform(val)
test = indexer.transform(test)
# One-hot encoder
encoder = OneHotEncoder(inputCol="c22", outputCol="c2")
train = encoder.transform(train)
val = encoder.transform(val)
test = encoder.transform(test)
# create the trainer and set its parameters
with open('H1_15300180012_output.txt', 'a') as f:
    f.write('\n \n')
    f.write('jq_H1_15300180012_output_naive_bayes\n')

para = 1.0
with open('H1_15300180012_output.txt', 'a') as f:
    f.write('Smoothing parameter: {} \n'.format(para))
nb = NaiveBayes(smoothing=para, modelType="multinomial", labelCol="label", featuresCol="c2")

# train the model
model = nb.fit(train)
df = spark.read.csv('home_data.csv', header=True)
df = df.withColumn("price", df["price"].cast(DoubleType()))\
                   .withColumn("sqft_living", df["sqft_living"].cast(DoubleType()))
                
print(df.columns)

# Get training sets

(trainData, testData) = df.randomSplit(seed=123, weights=[0.7,0.3])
print("The total data is {}, the training is {} and the test is {}"\
      .format(df.count(), trainData.count(), testData.count()))

#Train & Evaluate

stringifier = StringIndexer(inputCol="zipcode", outputCol="zipIndex")
oneHotter = OneHotEncoder(inputCol="zipIndex", outputCol="zipVector")
vectorizer = VectorAssembler(inputCols=["sqft_living", "zipVector"], outputCol="features")
glr = GeneralizedLinearRegression(labelCol="price", family="gaussian", link="identity", maxIter=10, regParam=0.3)
rf = RandomForestRegressor(labelCol="price", seed=1234)
rfAdv = RandomForestRegressor(labelCol="price", seed=1234, numTrees=500, maxDepth=10, maxBins=100, minInstancesPerNode=5, featureSubsetStrategy="all")
for alg in [(glr, "Linear Regression"), (rf, "Random Forest (Default)"), (rfAdv, "Random Forest (Advanced)")]:
    print("+++++%s Results+++++" % (alg[1]))
    simplePipeline = Pipeline(stages=[stringifier, oneHotter, vectorizer, alg[0]])
    model = simplePipeline.fit(trainData)

    # Print Results

    #testingData = vectorizer.transform(testData)
    # Make predictions.
    predictions = model.transform(testData)
Example #13
    store = Store.create(args.work_dir)

    # Download MNIST dataset
    data_url = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/mnist.bz2'
    libsvm_path = os.path.join(args.data_dir, 'mnist.bz2')
    if not os.path.exists(libsvm_path):
        subprocess.check_output(['wget', data_url, '-O', libsvm_path])

    # Load dataset into a Spark DataFrame
    df = spark.read.format('libsvm') \
        .option('numFeatures', '784') \
        .load(libsvm_path)

    # One-hot encode labels into SparseVectors
    encoder = OneHotEncoder(inputCols=['label'],
                            outputCols=['label_vec'],
                            dropLast=False)
    model = encoder.fit(df)
    train_df = model.transform(df)

    # Train/test split
    train_df, test_df = train_df.randomSplit([0.9, 0.1])

    # Disable GPUs when building the model to prevent memory leaks
    if LooseVersion(tf.__version__) >= LooseVersion('2.0.0'):
        # See https://github.com/tensorflow/tensorflow/issues/33168
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
    else:
        keras.backend.set_session(
            tf.Session(config=tf.ConfigProto(device_count={'GPU': 0})))
Example #14
    def one_hot_encoder(self, input_cols):
        output_cols = [each_col + "_vec" for each_col in input_cols]
        return OneHotEncoder(inputCols=input_cols,
                             outputCols=output_cols), output_cols
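# Hedged usage sketch for one_hot_encoder above, assuming Spark 3.x (the multi-column
# OneHotEncoder is an Estimator and must be fit), a hypothetical instance `prep` of the
# enclosing class, and a DataFrame `indexed_df` whose index columns were produced earlier
# by StringIndexer.
encoder, out_cols = prep.one_hot_encoder(["job_index", "marital_index"])
encoded_df = encoder.fit(indexed_df).transform(indexed_df)
encoded_df.select(out_cols).show(5)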
# one-hot encoding
#==============================================================================

from pyspark.ml.feature import OneHotEncoder

# build indexer

categorical_columns = [
    'term', 'emp_length', 'home_ownership', 'purpose', 'state'
]
stringindexer_stages = [
    StringIndexer(inputCol=c, outputCol='stringindexed_' + c)
    for c in categorical_columns
]
onehotencoder_stages = [
    OneHotEncoder(inputCol='stringindexed_' + c,
                  outputCol='onehotencoded_' + c) for c in categorical_columns
]
all_stages_transf1 = stringindexer_stages + onehotencoder_stages

## build pipeline model for transformation of Data
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=all_stages_transf1)

## fit pipeline model
pipeline_mode = pipeline.fit(loan_df_droped)

## transform data
df_coded = pipeline_mode.transform(loan_df_droped)

## remove uncoded columns
selected_columns = ['onehotencoded_' + c for c in categorical_columns] + [
Example #16
# | 24|     M|  1|   technician|   85711|
# | 53|     F|  2|        other|   94043|
# | 23|     M|  3|       writer|   32067|
# | 24|     M|  4|   technician|   43537|
# | 33|     F|  5|        other|   15213|
# | 42|     M|  6|    executive|   98101|
# | 57|     M|  7|administrator|   91344|
# | 36|     M|  8|administrator|   05201|
# | 29|     M|  9|      student|   01002|
# | 53|     M| 10|       lawyer|   90703|
# | 39|     F| 11|        other|   30329|
indexer = StringIndexer(inputCol="occupations", outputCol="occupationsIndex",handleInvalid='error')
indexed=indexer.fit(users).transform(users)
#transfer dataframe to rdd by ".rdd"
all_occupations = set(indexed.select("occupations","occupationsIndex").rdd.map(lambda x:(x[0],x[1])).collect())
encoder = OneHotEncoder(inputCol="occupationsIndex", outputCol="occupationsVec")
encoded = encoder.transform(indexed)
encoded.select("occupations","occupationsVec").show()
# +-------------+---------------+
# |  occupations| occupationsVec|
# +-------------+---------------+
# |   technician|(20,[11],[1.0])|
# |        other| (20,[1],[1.0])|
# |       writer| (20,[7],[1.0])|
# |   technician|(20,[11],[1.0])|
# |        other| (20,[1],[1.0])|
# |    executive| (20,[8],[1.0])|
# |administrator| (20,[3],[1.0])|
# |administrator| (20,[3],[1.0])|
# |      student| (20,[0],[1.0])|
Example #17
# $example on$
from pyspark.ml.feature import OneHotEncoder
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("OneHotEncoderExample")\
        .getOrCreate()

    # Note: categorical features are usually first encoded with StringIndexer
    # $example on$
    df = spark.createDataFrame([
        (0.0, 1.0),
        (1.0, 0.0),
        (2.0, 1.0),
        (0.0, 2.0),
        (0.0, 1.0),
        (2.0, 0.0)
    ], ["categoryIndex1", "categoryIndex2"])

    encoder = OneHotEncoder(inputCols=["categoryIndex1", "categoryIndex2"],
                            outputCols=["categoryVec1", "categoryVec2"])
    model = encoder.fit(df)
    encoded = model.transform(df)
    encoded.show()
    # $example off$

    spark.stop()
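# The note above says categorical string columns are usually indexed with StringIndexer
# first; a minimal, standalone sketch of that pre-step (hedged: Spark 3.x API,
# illustrative data), kept as comments so the example script above stays unchanged:
# from pyspark.ml.feature import StringIndexer, OneHotEncoder
# raw = spark.createDataFrame([("a",), ("b",), ("a",), ("c",)], ["category"])
# indexed = StringIndexer(inputCol="category", outputCol="categoryIndex").fit(raw).transform(raw)
# ohe = OneHotEncoder(inputCols=["categoryIndex"], outputCols=["categoryVec"], dropLast=False)
# ohe.fit(indexed).transform(indexed).show()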
Example #18
# test = test.drop("AGEImputed")
# train = train.drop("AGEImputed")

test.columns, train.columns

print("Number of training records: " + str(train.count()))
print("Number of testing records : " + str(test.count()))

# one-hot encoding  'pclass', 'embarked', 'sex'
from pyspark.ml.feature import OneHotEncoder, StringIndexer, IndexToString, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline, Model

x = StringIndexer(inputCol = "pclass", outputCol = "pclass_1").fit(train).transform(train)
OneHotEncoder(inputCol = "pclass_1", outputCol = "pclass_2").transform(x).show(10)   # demonstrating the work of OneHotEncoder with StringIndexer

si_pclass = StringIndexer(inputCol = "pclass", outputCol = "pclass_1")
si_embarked = StringIndexer(inputCol = "embarked", outputCol = "embarked_1")
si_survived = StringIndexer(inputCol = "survived", outputCol = "survived_1")
si_sex = StringIndexer(inputCol = "sex", outputCol = "sex_1")

vectorAssembler_features = VectorAssembler(inputCols = ["pclass_1", "embarked_1", "sex_1", "survived_1"], outputCol = "features")

rf = RandomForestRegressor(labelCol = "age", featuresCol = "features")

pipeline_rf = Pipeline(stages = [si_pclass, si_embarked, si_survived, si_sex, vectorAssembler_features, rf])

train.printSchema()

pipeline_rf.fit(train)
		 False if r.attributes['Good For'] is None else r.attributes['Good For']['breakfast'],
		 False if r.attributes['Ambience'] is None else r.attributes['Ambience']['romantic'],
		 False if r.attributes['Ambience'] is None else r.attributes['Ambience']['upscale'],
		 False if r.attributes['Ambience'] is None else r.attributes['Ambience']['casual'],
		 False if (r.attributes['Alcohol'] is None or r.attributes['Alcohol'] == 'none') else True,
		 False if r.attributes['Take-out'] is None else r.attributes['Take-out']]
	).toDF(clustering_columns)

# drop row with null values
lv_clustering_data = lv_clustering_data.dropna()

#Neighborhood feature engineering
stringIndexer = StringIndexer(inputCol="neighborhood", outputCol="neigh_index")
lv_model = stringIndexer.fit(lv_clustering_data)
lv_indexed = lv_model.transform(lv_clustering_data)
encoder = OneHotEncoder(dropLast=False, inputCol="neigh_index", outputCol="neigh_vec")
lv_encoded = encoder.transform(lv_indexed)

#initial feature set
# assembler = VectorAssembler(
#     inputCols=["stars", "price_range", "neigh_vec"],
#     outputCol="features_vec")

#expanded feature set
feature_columns = clustering_columns[2:]
feature_columns.append("neigh_vec")
assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features_vec")

lv_assembled = assembler.transform(lv_encoded)
Example #20
# different brands of chess sets, and use the column named
# `set` specifying which set each piece is from as one of
# the features.

chess = spark.table('chess.four_chess_sets')

# Use `StringIndexer` to convert `set` from string codes to
# numeric codes
indexer = StringIndexer(inputCol="set", outputCol="set_ix")
indexer_model = indexer.fit(chess)
list(enumerate(indexer_model.labels))
indexed = indexer_model.transform(chess)

# Depending on the model, we might also need to apply another transformer,
# like the `OneHotEncoder`, to generate a set of dummy variables
encoder = OneHotEncoder(inputCol="set_ix", outputCol="set_cd")
encoded = encoder.transform(indexed)

selected = encoded.select('base_diameter', 'height', 'set_cd', 'weight')
feature_columns = ['base_diameter', 'height', 'set_cd']

# We must assemble the features into a single column of vectors:
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
assembled = assembler.transform(selected)

(train, test) = assembled.randomSplit([0.8, 0.2])
lr = RandomForestRegressor(featuresCol="features", labelCol="weight")

lr_model = lr.fit(train)

test_with_predictions = lr_model.transform(test)
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline

categoricalColumns = ["UNIQUE_CARRIER", "ORIGIN", "DEST"]
numericalColumns = ["DISTANCE"]

# Convert string categorical columns to indexed integers
indexers = [
	StringIndexer(inputCol=c, outputCol ="{0}_indexed".format(c))
	for c in categoricalColumns
]

# OneHot Encoding
encoders = [
	OneHotEncoder(
		inputCol=indexer.getOutputCol(),
		outputCol ="{0}_encoded".format(indexer.getOutputCol())
		)
	for indexer in indexers
]

# Assembler for categorical columns
assemblerCategorical = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders], outputCol= "cat")
stages = indexers+encoders+ [assemblerCategorical]
pipelineCategorical = Pipeline(stages=stages)
df = pipelineCategorical.fit(df).transform(df)

# Assembler for Numerical columns
assemblerNumerical = VectorAssembler(inputCols = numericalColumns, outputCol = "num")
pipelineNumerical = Pipeline(stages = [assemblerNumerical])
df = pipelineNumerical.fit(df).transform(df)
Example #22
df_0 = df_pandas[df_pandas['Harm'] == 0]
df_1 = df_pandas[df_pandas['Harm'] == 1]
df_2 = df_pandas[df_pandas['Harm'] == 2]
df_1_new = df_1.sample(frac=1.64, replace=True)
df_2_new = df_2.sample(frac=4.3, replace=True)
df_pandas = pd.concat([df_0, df_1_new, df_2_new], ignore_index=True)
df_pandas.sample(frac=1)
print(df_pandas['Harm'].value_counts())
df = spark.createDataFrame(df_pandas)
df.toPandas()

# In[8]:

# Preparing for machine learning
cbwd_Indexer = StringIndexer(inputCol='cbwd', outputCol='cbwdIndex')
cbwd_encoder = OneHotEncoder(inputCol='cbwdIndex', outputCol='cbwdVec')

Harm_Indexer = StringIndexer(inputCol='Harm', outputCol='label')

assembler = VectorAssembler(
    inputCols=['DEWP', 'HUMI', 'PRES', 'cbwdVec', 'TEMP', 'Iws'],
    outputCol="features")

# In[9]:

# Pipeline
pipeline = Pipeline(
    stages=[cbwd_Indexer, Harm_Indexer, cbwd_encoder, assembler])
pipeline_model = pipeline.fit(df)
pipe_df = pipeline_model.transform(df)
pipe_df = pipe_df.select('label', 'features')
Example #23
# The following code does three things with pipeline:
# 
# * **`StringIndexer`** all categorical columns
# * **`OneHotEncoder`** all categorical index columns
# * **`VectorAssembler`** all feature columns into one vector column

# ### Categorical columns

from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
import pyspark.sql.functions as F

# categorical columns
categorical_columns = cuse.columns[0:3]
stage_string = [StringIndexer(inputCol= c, outputCol= c+"_string_encoded") for c in categorical_columns]
stage_one_hot = [OneHotEncoder(inputCol= c+"_string_encoded", outputCol= c+ "_one_hot") for c in categorical_columns]

ppl = Pipeline(stages=stage_string + stage_one_hot)
df = ppl.fit(cuse).transform(cuse)
df.toPandas().to_csv('cuse_afterTransform.csv')
df.select("age", 'age_string_encoded').distinct().sort(F.asc("age_string_encoded")).show()
df.select("education").distinct().show()
df.select("wantsMore").distinct().show()

# In[2]:
# ### Build VectorAssembler stage
df.columns

assembler = VectorAssembler(
  inputCols=['age_one_hot',
             'education_one_hot',
# 
# Let's break this down into multiple steps to make it all clear.

# In[12]:


from pyspark.ml.feature import (VectorAssembler,VectorIndexer,
                                OneHotEncoder,StringIndexer)


# In[13]:


#indexers = [StringIndexer(inputCol=column, outputCol=column+"_index").fit(df).transform(df) for column in df.columns ]
gender_indexer = StringIndexer(inputCol='Sex',outputCol='SexIndex')
gender_encoder = OneHotEncoder(inputCol='SexIndex',outputCol='SexVec')


# In[14]:


embark_indexer = StringIndexer(inputCol='Embarked',outputCol='EmbarkIndex')
embark_encoder = OneHotEncoder(inputCol='EmbarkIndex',outputCol='EmbarkVec')


# In[15]:


assembler = VectorAssembler(inputCols=['Pclass',
 'SexVec',
 'Age',
Example #25
def oneHot(df, base_col_name, col_name):
    from pyspark.sql import SparkSession
    from pyspark import SparkContext, SparkConf
    import os
    import time

    #os.environ['SPARK_HOME'] = '/root/spark-2.1.1-bin'

    sparkConf = SparkConf() \
        .setAppName('pyspark rentmodel') \
        .setMaster('local[*]')
    sc = SparkContext.getOrCreate(sparkConf)

    sc.setLogLevel('WARN')

    spark = SparkSession(sparkContext=sc)

    df = df.select(base_col_name, col_name)
    df = df.filter(df[base_col_name].isNotNull())
    # The Python API's StringIndexer handleInvalid has no 'keep' option, so it cannot handle null values directly
    null_col_name = col_name + '_null'
    df = df.na.fill(null_col_name, col_name)
    df_NULL = df.filter(df[col_name] == 'NULL')

    df = df.filter(df[col_name].isNotNull())
    df = df.filter(df[col_name] != '')
    print('one-hot=======', col_name, df.count())

    temp_path = '/data/20180621/ALL_58_beijing_save_models/'

    if df_NULL.count() > 0:

        def udf_NULL(s):
            return null_col_name

        udf_transf = udf(udf_NULL)

        df_NULL = df_NULL.select('*',
                                 udf_transf(col_name).alias('tmp_col_name'))
        df_NULL = df_NULL.na.fill(null_col_name, 'tmp_col_name')
        df_NULL = df_NULL.drop(col_name)
        df_NULL = df_NULL.withColumnRenamed('tmp_col_name', col_name)

        df_no_NULL = df.filter(df[col_name] != 'NULL')
        df_no_NULL = df_no_NULL.withColumn('tmp_col_name', df[col_name])
        df_no_NULL = df_no_NULL.drop(col_name)
        df_no_NULL = df_no_NULL.withColumnRenamed('tmp_col_name', col_name)
        df = df_no_NULL.union(df_NULL)
        del df_no_NULL

    index_name = col_name + 'Index'
    vector_name = col_name + 'Vec'
    """
        StringIndexer可以设置handleInvalid='skip',但是不可以设置handleInvalid='keep'.
        设置这个会删除需要跳过的这一行,这样会导致用户体验差,因为用户输入
        一条数据,就直接给删了,什么都没有。因此暂不设置,新数据输入时,如果没有,
        可以在已经有的字符串中随机选择一个来替换没有的这个新字符串.
    """
    stringIndexer = StringIndexer(inputCol=col_name, outputCol=index_name)
    model = stringIndexer.fit(df)
    indexed = model.transform(df)
    encoder = OneHotEncoder(dropLast=False,
                            inputCol=index_name,
                            outputCol=vector_name)
    encoded = encoder.transform(indexed)

    #save
    stringIndexer.save(temp_path + 'stringIndexer' + col_name)
    model.save(temp_path + 'stringIndexer_model' + col_name)

    # StringIndexer(inputCol=col_name, outputCol=index_name)
    # onehotEncoderPath = temp_path + col_name
    # loadedEncoder = OneHotEncoder.load(onehotEncoderPath)
    # loadedEncoder.setParams(inputCol=index_name, outputCol=vector_name)
    # encoded = loadedEncoder.transform(df)
    # encoded.show()

    onehotEncoderPath = temp_path + col_name + '_new'
    encoder.save(onehotEncoderPath)

    sub_encoded = encoded.select(base_col_name, vector_name)

    return sub_encoded
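# Hedged usage sketch for oneHot above, assuming Spark 2.x (the transformer-style
# OneHotEncoder used inside the function), an existing `spark` session, that the
# hard-coded temp_path directory exists, and an illustrative rental DataFrame.
rent_df = spark.createDataFrame([(1, "north"), (2, "south"), (3, None)],
                                ["house_id", "district"])
district_vec = oneHot(rent_df, "house_id", "district")   # returns (house_id, districtVec)
district_vec.show(truncate=False)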
from pyspark.ml.feature import Imputer

df3 = Imputer(inputCols=['Age','Fare'], outputCols=['Age1','Fare1']).fit(df3).transform(df3)

df3.show(3)
#--------------------------------


# df3 = df2.select('Sex',df2.Pclass.cast('double'),df2.Survived.cast('double'),'Embarked',df2.Fare.cast('double'),df2.Age.cast('double'))

from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

df3 = StringIndexer(inputCol='Embarked',outputCol='Embarked1').fit(df3).transform(df3)
df3.show()

df3 = OneHotEncoder(inputCol='Embarked1',outputCol='Embarked2',dropLast=False).transform(df3)
df3.show()

# --------------------------------------------

df3 = StringIndexer(inputCol='Sex',outputCol='Gender').fit(df3).transform(df3)
df3 = OneHotEncoder(inputCol='Gender',outputCol='Gender1',dropLast=False).transform(df3)
df3.show(5)

# cast to double
#df3 = df3.select(df3.Pclass.cast('double'),df3.Gender1,df3.Embarked2,df3.Survived.cast('double'))
#df3.printSchema()


# Vector assembler
# COMMAND ----------

# MAGIC %md The ML package needs the label and feature vector to be added as columns to the input dataframe. We set up a pipeline to pass the data through transformers in order to extract the features and label. We index each categorical column using the `StringIndexer` to a column of number indices, then convert the indexed categories into one-hot encoded variables with at most a single one-value. These binary vectors are appended to the end of each row. Encoding categorical features allows decision trees to treat categorical features appropriately, improving performance. We then use the `StringIndexer` to encode our labels to label indices.

# COMMAND ----------

categoricalColumns = ["OriginAirportCode", "Carrier", "DestAirportCode"]
stages = []  # stages in our Pipeline
for categoricalCol in categoricalColumns:
    # Category Indexing with StringIndexer
    stringIndexer = StringIndexer(inputCol=categoricalCol,
                                  outputCol=categoricalCol + "Index")
    # Use OneHotEncoderEstimator to convert categorical variables into binary SparseVectors
    # encoder = OneHotEncoderEstimator(dropLast=False, inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    # Using the slightly older OneHotEncoder (instead of OneHotEncoderEstimator) for compatibility reasons when operationalizing within the DSVM
    encoder = OneHotEncoder(inputCol=stringIndexer.getOutputCol(),
                            outputCol=categoricalCol + "classVec")
    # Add stages.  These are not run here, but will run all at once later on.
    stages += [stringIndexer, encoder]

# Convert label into label indices using the StringIndexer
label_stringIdx = StringIndexer(inputCol="DepDel15", outputCol="label")
stages += [label_stringIdx]

# COMMAND ----------

# MAGIC %md Now we need to use the `VectorAssembler` to combine all the feature columns into a single vector column. This includes our numeric columns as well as the one-hot encoded binary vector columns.

# COMMAND ----------

# Transform all features into a vector using VectorAssembler
numericCols = [
Example #28
df.createOrReplaceTempView("df")
spark.sql("SELECT * from df").show()

# Please create a VectorAssembler which consumes columns X, Y and Z and produces a column "features"
#

# In[31]:

from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import Normalizer

indexer = StringIndexer(inputCol="CLASS", outputCol="label")
encoder = OneHotEncoder(inputCol="label", outputCol="labelVec")
vectorAssembler = VectorAssembler(inputCols=["X", "Y", "Z"],
                                  outputCol="features")
normalizer = Normalizer(inputCol="features", outputCol="features_norm", p=1.0)

# Please instantiate a classifier from the SparkML package and assign it to the classifier variable. Make sure to either
# 1.	Rename the “CLASS” column to “label” or
# 2.	Specify the label-column correctly to be “CLASS”
#

# In[55]:

# LogisticRegression accuracy was 54.7%; Naive Bayes requires non-negative data
# from pyspark.ml.classification import LogisticRegression
# classifier = LogisticRegression(maxIter=200, regParam=0.2, elasticNetParam=0.8)
def make_regr_model(data, sc, model_path, model_name, target, ml_model='default', save=True):

    t0 = time()
    # Stages for pipline
    stages = []

    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Identify categorical and numerical variables
    catCols = [x for (x, dataType) in trainingData.dtypes if ((dataType == "string") | (dataType == "boolean"))]

    numCols = [x for (x, dataType) in trainingData.dtypes if (((dataType == "int") | (dataType == "bigint")
                                                                 | (dataType == "float") | (dataType == "double"))
               & (x != "target"))]

    # OneHotEncode categorical variables
    indexers = [StringIndexer(inputCol=column, outputCol=column + "-index", handleInvalid="keep") for column in catCols]

    encoder = OneHotEncoder(
        inputCols=[indexer.getOutputCol() for indexer in indexers],
        outputCols=["{0}-encoded".format(indexer.getOutputCol()) for indexer in indexers]
    )
    assembler_cat = VectorAssembler(
        inputCols=encoder.getOutputCols(),
        outputCol="categorical-features",
        handleInvalid="skip"
    )

    stages += indexers
    stages += [encoder, assembler_cat]



    assembler_num = VectorAssembler(
        inputCols=numCols,
        outputCol="numerical-features",
        handleInvalid="skip"
    )

    # Standardize numerical variables
    scaler = StandardScaler(inputCol="numerical-features", outputCol="numerical-features_scaled")

    # Combine all features in one vector
    assembler_all = VectorAssembler(
        inputCols=['categorical-features', 'numerical-features_scaled'],
        outputCol='features',
        handleInvalid="skip"
    )

    stages += [assembler_num, scaler, assembler_all]

    # Train a RandomForest model.
    if ml_model == 'default':
        rf = RandomForestRegressor(labelCol="target", featuresCol="features")
    else:
        rf = ml_model

    stages += [rf]

    # Chain indexers and forest in a Pipeline
    pipeline = Pipeline(stages=stages)

    # Train model.  This also runs the indexers.
    model = pipeline.fit(trainingData)

    # Make predictions.
    predictions = model.transform(testData)

    # Select example rows to display.
    #predictions.select("prediction", "target", "features").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(
        labelCol="target", predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("RMSE = %g" % (0.0 + rmse))

    if save:
        # Final model saving and statistics writing
        tt = time() - t0
        timestamp = int(time())
        model.write().overwrite().save(model_path)

        cluster = Cluster(['127.0.0.1'], "9042")
        session = cluster.connect("models")
        query = ("INSERT INTO %s (model_name, timestamp, target, learning_time, model_path, stat)") % ("models_statistics")
        query = query + " VALUES (%s, %s, %s, %s, %s, %s)"
        session.execute(query, (model_name, timestamp, target, tt, model_path, rmse))
        session.shutdown()
        cluster.shutdown()

        # Stop spark session
        sc.stop()

    if not save:
        return model, sc
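# Hedged usage sketch for make_regr_model above; `df`, `sc` and the imports used inside the
# function are assumed to be in scope, and paths/names are illustrative only. The labelCol is
# hard-coded to "target", so the DataFrame must contain a numeric column literally named
# "target". With save=False the Cassandra write is skipped and (model, sc) is returned with
# the SparkContext still alive.
model, sc = make_regr_model(df, sc,
                            model_path="/tmp/models/price_rf",
                            model_name="price_rf",
                            target="target",
                            save=False)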
Example #30

# StringIndexer
from pyspark.ml.feature import StringIndexer

categeriesIndexer = StringIndexer(inputCol="job",outputCol="job_index")
categeriesTransformer = categeriesIndexer.fit(df)

print(categeriesTransformer.labels)

# OneHot encoder

from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler

onehotencoder = OneHotEncoder(inputCol='job_index', outputCol='job_vec')
encoded = onehotencoder.transform(categeriesTransformer.transform(df))
assember = VectorAssembler(inputCols=["age", "user_id", "job_vec"], outputCol="features")


from pyspark.ml.classification import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier(labelCol="label", featuresCol="features", impurity="gini", maxDepth=5, maxBins=5)
model = dt_clf.fit(df)

from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[categeriesIndexer,onehotencoder,assember,dt_clf])
pipeline.getStages()
pipeModel = pipeline.fit(train_df)
predicted = pipeModel.transform(test_df)

print(predicted.columns)
Example #31
numeric_col = ["qty_reference"]

imputer = Imputer(inputCols=numeric_col,
                  outputCols=["{}_imputed".format(c) for c in numeric_col])

categorical_col = ["SITE_FORMAT", "season"]

indexers = [
    StringIndexer(inputCol=c,
                  outputCol="{0}_indexedd".format(c),
                  handleInvalid='skip') for c in categorical_col
]

encoders = [
    OneHotEncoder(dropLast=True,
                  inputCol=indexer.getOutputCol(),
                  outputCol="{0}_encodedd".format(indexer.getOutputCol()))
    for indexer in indexers
]

assembler = VectorAssembler(inputCols= [encoder.getOutputCol() for encoder in encoders] + \
 [x +'_imputed' for x in numeric_col] + ['day', 'month', 'weekday', 'weekend', 'monthend', 'monthbegin', 'monthquarter', 'yearquarter'],
                            outputCol="Features")

pca = PCA(k=5, inputCol="Features", outputCol="pcaFeatures")

pipeline = Pipeline(stages = [dex, mex, yex, wdex, wex, meex, vex, mbex, mqex, yqex, ydex] + \
    [imputer] + \
    indexers + \
    encoders +  \
    [assembler]+ \
Example #32
def build_indep_vars(df,
                     independent_vars,
                     categorical_vars=None,
                     keep_intermediate=False,
                     summarizer=True):
    """
    Data verification
    df               : DataFrame
    independent_vars : List of column names
    categorical_vars : None or list of column names, e.g. ['col1', 'col2']
    """
    assert (
        type(df) is pyspark.sql.dataframe.DataFrame
    ), 'pyspark_glm: A pySpark dataframe is required as the first argument.'
    assert (
        type(independent_vars) is list
    ), 'pyspark_glm: List of independent variable column names must be the second argument.'
    for iv in independent_vars:
        assert (
            type(iv) is str
        ), 'pyspark_glm: Independent variables must be column name strings.'
        assert (
            iv in df.columns
        ), 'pyspark_glm: Independent variable name is not a dataframe column.'
    if categorical_vars:
        for cv in categorical_vars:
            assert (
                type(cv) is str
            ), 'pyspark_glm: Categorical variables must be column name strings.'
            assert (
                cv in df.columns
            ), 'pyspark_glm: Categorical variable name is not a dataframe column.'
            assert (
                cv in independent_vars
            ), 'pyspark_glm: Categorical variables must be independent variables.'
    """
    Code
    """
    from pyspark.ml import Pipeline
    from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
    from pyspark.ml.regression import GeneralizedLinearRegression

    if categorical_vars:
        string_indexer = [
            StringIndexer(inputCol=x, outputCol='{}_index'.format(x))
            for x in categorical_vars
        ]

        encoder = [
            OneHotEncoder(dropLast=True,
                          inputCol='{}_index'.format(x),
                          outputCol='{}_vector'.format(x))
            for x in categorical_vars
        ]

        independent_vars = [
            '{}_vector'.format(x) if x in categorical_vars else x
            for x in independent_vars
        ]
    else:
        string_indexer, encoder = [], []

    assembler = VectorAssembler(inputCols=independent_vars,
                                outputCol='indep_vars')
    pipeline = Pipeline(stages=string_indexer + encoder + [assembler])
    model = pipeline.fit(df)
    df = model.transform(df)

    #for building the crosswalk between indices and column names
    if summarizer:
        param_crosswalk = {}

        i = 0
        for x in independent_vars:
            if '_vector' in x[-7:]:
                xrs = x[:-len('_vector')]  # strip the '_vector' suffix (rstrip would also eat trailing letters of the name)
                dst = df[[xrs, '{}_index'.format(xrs)]].distinct().collect()

                for row in dst:
                    param_crosswalk[int(row['{}_index'.format(xrs)] +
                                        i)] = row[xrs]
                maxind = max(param_crosswalk.keys())
                del param_crosswalk[maxind]  #for droplast
                i += len(dst)
            elif '_index' in x[-6:]:
                pass
            else:
                param_crosswalk[i] = x
                i += 1
        """
        {0: 'carat',
         1: u'SI1',
         2: u'VS2',
         3: u'SI2',
         4: u'VS1',
         5: u'VVS2',
         6: u'VVS1',
         7: u'IF'}
        """
        make_summary = Summarizer(param_crosswalk)

    if not keep_intermediate:
        fcols = [
            c for c in df.columns
            if '_index' not in c[-6:] and '_vector' not in c[-7:]
        ]
        df = df[fcols]

    if summarizer:
        return df, make_summary
    else:
        return df
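# Hedged usage sketch for build_indep_vars above, assuming Spark 2.x (transformer-style
# OneHotEncoder) and an illustrative diamonds-style CSV; summarizer=False avoids the
# project-specific Summarizer helper referenced in the function.
diamonds = spark.read.csv("diamonds.csv", header=True, inferSchema=True)  # hypothetical file
prepared = build_indep_vars(diamonds,
                            independent_vars=["carat", "clarity"],
                            categorical_vars=["clarity"],
                            summarizer=False)
prepared.select("indep_vars").show(5, truncate=False)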
# In[101]:

df_model=df_ORG
# stringIndexer1 = StringIndexer(inputCol="Origin", outputCol="originIndex")
# model_stringIndexer = stringIndexer1.fit(df_model)
# indexedOrigin = model_stringIndexer.transform(df_model)
# encoder1 = OneHotEncoder(dropLast=False, inputCol="originIndex", outputCol="originVec")
# df_model = encoder1.transform(indexedOrigin)


# In[ ]:

stringIndexer2 = StringIndexer(inputCol="Dest", outputCol="destIndex")
model_stringIndexer = stringIndexer2.fit(df_model)
indexedDest = model_stringIndexer.transform(df_model)
encoder2 = OneHotEncoder(dropLast=False, inputCol="destIndex", outputCol="destVec")
df_model = encoder2.transform(indexedDest)


# We use __labeled point__ to make local vectors associated with a label/response. In MLlib, labeled points are used in supervised learning algorithms and they are stored as doubles. For binary classification, a label should be either 0 (negative) or 1 (positive). 

# In[105]:

assembler = VectorAssembler(
    inputCols = ['Year','Month','DayofMonth','DayOfWeek','Hour','Distance','destVec'],
    outputCol = "features")
output = assembler.transform(df_model)
airlineRDD = output.rdd.map(lambda row: LabeledPoint([0,1][row['DepDelayed']], row['features']))


# ### Preprocessing: Splitting dataset into train and test datasets
     WHEN (pickup_hour >= 11 AND pickup_hour <= 15) THEN "Afternoon"
     WHEN (pickup_hour >= 16 AND pickup_hour <= 19) THEN "PMRush"
    END as TrafficTimeBins
    FROM taxi_test 
"""
taxi_df_test_with_newFeatures = sqlContext.sql(sqlStatement)

## CACHE DATA-FRAME IN MEMORY & MATERIALIZE DF IN MEMORY
taxi_df_test_with_newFeatures.cache()
taxi_df_test_with_newFeatures.count()

## INDEX AND ONE-HOT ENCODING
stringIndexer = StringIndexer(inputCol="vendor_id", outputCol="vendorIndex")
model = stringIndexer.fit(taxi_df_test_with_newFeatures) # Input data-frame is the cleaned one from above
indexed = model.transform(taxi_df_test_with_newFeatures)
encoder = OneHotEncoder(dropLast=False, inputCol="vendorIndex", outputCol="vendorVec")
encoded1 = encoder.transform(indexed)

stringIndexer = StringIndexer(inputCol="rate_code", outputCol="rateIndex")
model = stringIndexer.fit(encoded1)
indexed = model.transform(encoded1)
encoder = OneHotEncoder(dropLast=False, inputCol="rateIndex", outputCol="rateVec")
encoded2 = encoder.transform(indexed)

stringIndexer = StringIndexer(inputCol="payment_type", outputCol="paymentIndex")
model = stringIndexer.fit(encoded2)
indexed = model.transform(encoded2)
encoder = OneHotEncoder(dropLast=False, inputCol="paymentIndex", outputCol="paymentVec")
encoded3 = encoder.transform(indexed)

stringIndexer = StringIndexer(inputCol="TrafficTimeBins", outputCol="TrafficTimeBinsIndex")
  (Vectors.dense(2, 5, 6),2),
  (Vectors.dense(1, 8, 9),3)
]).toDF("features", "label")
indxr = VectorIndexer()\
  .setInputCol("features")\
  .setOutputCol("idxed")\
  .setMaxCategories(2)
indxr.fit(idxIn).transform(idxIn).show()


# COMMAND ----------

from pyspark.ml.feature import OneHotEncoder, StringIndexer
lblIndxr = StringIndexer().setInputCol("color").setOutputCol("colorInd")
colorLab = lblIndxr.fit(simpleDF).transform(simpleDF.select("color"))
ohe = OneHotEncoder().setInputCol("colorInd")
ohe.transform(colorLab).show()


# COMMAND ----------

from pyspark.ml.feature import Tokenizer
tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn.transform(sales.select("Description"))
tokenized.show(20, False)


# COMMAND ----------

from pyspark.ml.feature import RegexTokenizer
rt = RegexTokenizer()\
Example #36
  def initialize(self, do_scaling=True, do_onehot=True):
    """Reads the dataset, initializes class members.

    features_df: Original DataFrame as read from the features_file.
    train_df: A DataFrame with columns Lat, Lon, Pickup_Count and
        vector columns Features & ScaledFeatures. Contains only data before 2015.
    test_df: As train_df, but only containing data of 2015.
    districts_with_counts: A DataFrame with all districts and their counts.
    """

    # Read feature dataframe
    self.features_df = self.sql_context.read.parquet(self.features_file).cache()

    # Set exclude columns to default
    exclude_columns = self.EXCLUDE_COLUMNS

    # Scale features
    if do_scaling:
      assembler = VectorAssembler(inputCols=self.SCALE_COLUMNS,
                                  outputCol='FeaturesToScale')
      self.features_df = assembler.transform(self.features_df)
      scaler = StandardScaler(inputCol='FeaturesToScale',
                              outputCol=('ScaledFeatures'),
                              withStd=True, withMean=False)
      self.features_df = scaler.fit(self.features_df).transform(self.features_df)

      exclude_columns += self.SCALE_COLUMNS + ['FeaturesToScale']

    # Adapt categorical features that do not have a value range of [0, numCategories)
    for column in ['Day', 'Month', 'Day_Of_Year']:
        if column in self.features_df.columns:
            self.features_df = self.features_df.withColumn(column, self.features_df[column] - 1)

    # Encode categorical features using one-hot encoding
    if do_onehot:
      vec_category_columns = ['%s_Vector' % column for column in self.ONE_HOT_COLUMNS]
      for i in range(len(self.ONE_HOT_COLUMNS)):
        column = self.ONE_HOT_COLUMNS[i]
        if column in self.features_df.columns:
            self.features_df = self.features_df.withColumn(column, self.features_df[column].cast(DoubleType()))
            encoder = OneHotEncoder(inputCol=column,
                                    outputCol=vec_category_columns[i],
                                    dropLast=False)
            self.features_df = encoder.transform(self.features_df)
      exclude_columns += self.ONE_HOT_COLUMNS

    # Vectorize features
    feature_columns = [column for column in self.features_df.columns
                              if column not in exclude_columns]
    assembler = VectorAssembler(inputCols=feature_columns, outputCol='Features')
    self.features_df = assembler.transform(self.features_df)

    # Set number of distinct values for categorical features (identified by index)
    self.categorical_features_info = {}
    if not do_onehot:
        self.categorical_features_info = {i:self.CATEGORY_VALUES_COUNT[feature_columns[i]]
                                          for i in range(len(feature_columns))
                                          if feature_columns[i] in self.CATEGORY_VALUES_COUNT.keys()}

    # Split into train and test data
    split_date = datetime(2015, 1, 1)
    self.train_df = self.features_df.filter(self.features_df.Time < split_date).cache()
    self.test_df = self.features_df.filter(self.features_df.Time > split_date).cache()

    # Compute Districts with counts
    self.districts_with_counts = self.features_df \
                                 .groupBy([self.features_df.Lat, self.features_df.Lon]) \
                                 .count()
#Encoding Categorical Variable 
from pyspark.sql.functions import *
categoricalCol=["SEX","MARRIAGE","AGE","EDUCATION","PAY_0","PAY_2","PAY_3","PAY_4","PAY_5","PAY_6"]

from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline

for c in categoricalCol:
  str1=c+"_Index"
  str2=c+"_Vec"
  stringIndexer=StringIndexer().setInputCol(c).setOutputCol(str1)
  model = stringIndexer.fit(trans_df3)
  
  indexed = model.transform(trans_df3)
  encoder= OneHotEncoder().setInputCol(str1).setOutputCol(str2)
  trans_df3=encoder.transform(indexed)

trans_df3.show(3)


from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

#assembler = VectorAssembler(
#    inputCols=["LIMIT_BAL", "SEX", "EDUCATION","MARRIAGE","AGE"],
#    outputCol="features")

assembler=VectorAssembler(inputCols=["LIMIT_BAL","BILL_AMT1","BILL_AMT2","BILL_AMT3","BILL_AMT4","BILL_AMT5","BILL_AMT6", \
                                    "PAY_AMT1","PAY_AMT2","PAY_AMT3","PAY_AMT4","PAY_AMT5","PAY_AMT6","SEX_Vec","MARRIAGE_Vec", \
                                    "AGE_Vec","EDUCATION_Vec","PAY_0_Vec","PAY_2_Vec","PAY_3_Vec","PAY_4_Vec","PAY_5_Vec","PAY_6_Vec"],outputCol="features")
Example #38
def transform(df, spark, sql_query = None, numerical_features = [], categorical_features = [],\
              normalize = True, normalize_p=2):

    # Apply SQL query
    if sql_query != None:

        df.createOrReplaceTempView("netlytics")
        # Execute Query
        result_df = spark.sql(sql_query)
        df = result_df

    # Transform Strings in OneHot
    schema = df.schema
    feat_to_type = {}
    for struct in schema:
        feat_to_type[struct.name] = str(struct.dataType)

    for feature in categorical_features:

        # Replaces None
        k = col(feature)
        df = df.withColumn(feature, when(k.isNull(), "__NA__").otherwise(k))

        stringIndexer = StringIndexer(inputCol=feature,
                                      outputCol=feature + "_indexed",
                                      handleInvalid="skip")
        model = stringIndexer.fit(df)
        df = model.transform(df)

        encoder = OneHotEncoder(inputCol=feature + "_indexed",
                                outputCol=feature + "_encoded")
        df = encoder.transform(df)

    # Extract Features
    def extract_features(row, numerical_features, feat_to_type):
        output_features = {}

        fields = list(row.asDict().keys())
        for field in fields:
            if field in numerical_features and feat_to_type[
                    field] != "StringType":
                output_features[field] = float(row[field])
            if field.endswith("_encoded"):
                output_list = list(row[field])
                for i, v in enumerate(output_list):
                    tmp_field = field + "_" + str(i)
                    output_features[tmp_field] = float(v)

        features = [
            v for k, v in sorted(output_features.items(),
                                 key=operator.itemgetter(0))
        ]

        old_dict = row.asDict()
        old_dict["features"] = DenseVector(features)
        new_row = Row(**old_dict)
        return new_row

    #spark = df.rdd.
    rdd = df.rdd.map(
        lambda row: extract_features(row, numerical_features, feat_to_type))
    df = spark.createDataFrame(rdd, samplingRatio=1, verifySchema=False)

    # Normalize
    if normalize:
        normalizer = Normalizer(inputCol="features",
                                outputCol="featuresNorm",
                                p=normalize_p)
        df = normalizer.transform(df)
        df = df.drop("features")
        df = df.withColumnRenamed("featuresNorm", "features")

    # Delete intermediate columns:
    schema = df.schema
    feat_to_type = {}
    for struct in schema:
        feat_to_type[struct.name] = str(struct.dataType)

    for feature in feat_to_type:
        if feat_to_type[feature] != "StringType":
            if feature.endswith("_encoded") or feature.endswith("_indexed"):
                df = df.drop(feature)

    return df
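# Hedged usage sketch for transform above, assuming Spark 2.x and that `spark` plus the
# imports used inside the function (StringIndexer, OneHotEncoder, Normalizer, DenseVector,
# Row, col, when, operator) are in scope; column names are illustrative only.
flows = spark.createDataFrame([(10.0, "tcp"), (3.5, "udp"), (7.2, "tcp")],
                              ["bytes", "protocol"])
prepared = transform(flows, spark,
                     numerical_features=["bytes"],
                     categorical_features=["protocol"])
prepared.select("features").show(truncate=False)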
# MAGIC 
# MAGIC Here, we will use a combination of [StringIndexer](http://spark.apache.org/docs/latest/ml-features.html#stringindexer) and [OneHotEncoder](http://spark.apache.org/docs/latest/ml-features.html#onehotencoder) to convert the categorical variables. The OneHotEncoder will return a [SparseVector](https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.linalg.SparseVector).

# COMMAND ----------

###One-Hot Encoding
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
  
categoricalColumns = ["workclass", "education", "marital_status", "occupation", "relationship", "race", "sex", "native_country"]
for categoricalCol in categoricalColumns:
  # Category Indexing with StringIndexer
  stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol+"Index")
  model = stringIndexer.fit(dataset)
  indexed = model.transform(dataset)
  # Use OneHotEncoder to convert categorical variables into binary SparseVectors
  encoder = OneHotEncoder(inputCol=categoricalCol+"Index", outputCol=categoricalCol+"classVec")
  encoded = encoder.transform(indexed)
  dataset = encoded

print(dataset.take(1))

# COMMAND ----------

# MAGIC %md
# MAGIC The above code basically indexes each categorical column using the StringIndexer, and then converts the indexed categories into one-hot encoded variables. The resulting output has the binary vectors appended to the end of each row.

# COMMAND ----------

# MAGIC %md
# MAGIC We use the StringIndexer() again here to encode our labels to label indices
Example #40
File: main.py  Project: HabibAouani/Cours
                      "Soil_Type9", "Soil_Type10", "Soil_Type11", "Soil_Type12",
                      "Soil_Type13",
                      "Soil_Type14", "Soil_Type16", "Soil_Type17", "Soil_Type18", "Soil_Type19",
                      "Soil_Type20",
                      "Soil_Type21", "Soil_Type22", "Soil_Type23", "Soil_Type24", "Soil_Type26",
                      "Soil_Type27",
                      "Soil_Type28", "Soil_Type29", "Soil_Type30", "Soil_Type31", "Soil_Type32", "Soil_Type33",
                      "Soil_Type34",
                      "Soil_Type35", "Soil_Type36", "Soil_Type37", "Soil_Type38", "Soil_Type39", "Soil_Type40"]
# Soil_Type7, Soil_Type8, Soil_Type15, Soil_Type25: always the same value

for categoricalCol in categoricalColumns:
    # Category Indexing with StringIndexer
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    # Use OneHotEncoder to convert categorical variables into binary SparseVectors
    encoder = OneHotEncoder(inputCol=categoricalCol + "Index", outputCol=categoricalCol + "classVec")
    # Add stages.  These are not run here, but will run all at once later on.
    stages += [stringIndexer, encoder]

# Numerical columns : create vecAssembler
numericCols = ["Elevation", "Aspect", "Slope", "Horizontal_Distance_To_Hydrology", "Vertical_Distance_To_Hydrology",
                 "Horizontal_Distance_To_Roadways", "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm",
                 "Horizontal_Distance_To_Fire_Points", "Wilderness_Area1", "Wilderness_Area2", "Wilderness_Area3",
                 "Wilderness_Area4"]
# Transform all features into a vector using VectorAssembler
assemblerInputs = list(map(lambda c: c + "classVec", categoricalColumns)) + numericCols
vecAssembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [vecAssembler]

# Split existing trainingData into training and test sets (30% held out for testing)
(training, test) = trainingData.randomSplit([0.7, 0.3], seed=1234)