示例#1
0
def encoding2(df, incol, outcol):
    """One-hot encode a single column of ``df``.

    A ``OneHotEncoderEstimator`` reading ``incol`` and writing ``outcol`` is
    fitted on ``df``; the fitted model is then applied to the same ``df``.

    Args:
        df: DataFrame used both for fitting and for the transformation.
        incol: Name of the column to encode.
        outcol: Name of the column that receives the encoding.

    Returns:
        The transformed DataFrame. The fitted model is not returned.
    """
    estimator = OneHotEncoderEstimator(inputCols=[incol], outputCols=[outcol])
    return estimator.fit(df).transform(df)
示例#2
0
def encoding(i, df, col):
    """One-hot encode ``col`` of ``df`` into a column named ``"p<i>"``.

    Args:
        i: Value appended (via ``str``) to ``"p"`` to form the output
            column name.
        df: DataFrame used both for fitting and for the transformation.
        col: Name of the column to encode.

    Returns:
        The transformed DataFrame. The fitted model is not returned.
    """
    target = "p" + str(i)
    model = OneHotEncoderEstimator(inputCols=[col], outputCols=[target]).fit(df)
    return model.transform(df)
示例#3
0
def one_hot_encoder_estimator(dataset, inputCols, outputCols=None):
    """Fit a ``OneHotEncoderEstimator`` on ``dataset`` and apply it.

    Args:
        dataset: DataFrame used both for fitting and for the transformation.
        inputCols: Names of the columns to encode. A single name may be
            passed as a plain string.
        outputCols: Names of the encoded columns, one per input column. A
            single name may be passed as a plain string. When omitted, each
            input name suffixed with ``'_ohee'`` is used.

    Returns:
        A tuple ``(transformed_dataset, fitted_model)``.
    """
    from pyspark.ml.feature import OneHotEncoderEstimator

    # The estimator takes lists of column names, so wrap a lone string.
    if isinstance(inputCols, str):
        inputCols = [inputCols]
    if outputCols is None:
        # One derived name per input column; concatenating a suffix onto
        # the list itself would raise TypeError.
        outputCols = [name + '_ohee' for name in inputCols]
    elif isinstance(outputCols, str):
        outputCols = [outputCols]
    model = OneHotEncoderEstimator(inputCols=inputCols, outputCols=outputCols).fit(dataset)
    return model.transform(dataset), model
示例#4
0
# Exercise_1
# Bring the one-hot encoder estimator into scope
from pyspark.ml.feature import OneHotEncoderEstimator

# Configure the encoder for the origin index column and fit it to the
# flights data in one step; `onehot` ends up bound to the fitted model
onehot = OneHotEncoderEstimator(
    inputCols=['org_idx'],
    outputCols=['org_dummy'],
).fit(flights)

# Add the encoded 'org_dummy' column
flights_onehot = onehot.transform(flights)

# Show each distinct (org, org_idx, org_dummy) combination, ordered by index
(
    flights_onehot
    .select('org', 'org_idx', 'org_dummy')
    .distinct()
    .sort('org_idx')
    .show()
)

# --------------------------------------------------
# Exercise_2
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Configure a linear regression on the 'duration' label, then fit it to the
# training data; `regression` ends up bound to the fitted model
regression = LinearRegression(labelCol='duration')
regression = regression.fit(flights_train)

# Score the testing data and show the first five label/prediction pairs
# without truncating the cell contents
predictions = regression.transform(flights_test)
predictions.select('duration', 'prediction').show(5, truncate=False)

# Evaluate the predictions against 'duration' (the evaluator's default
# metric is RMSE)
RegressionEvaluator(labelCol='duration').evaluate(predictions)

# --------------------------------------------------
# Exercise_3 
示例#5
0
# Fit a string indexer on the 'type' column; `indexer` ends up bound to the
# fitted model
indexer = StringIndexer(inputCol='type', outputCol='type_idx').fit(cars)
# Append the 'type_idx' column
cars = indexer.transform(cars)

# Derived features: weight divided by length, by length squared and by
# length cubed, each passed through round(..., 2)
# NOTE(review): `round` is presumably pyspark.sql.functions.round — confirm
# against the file's imports
cars = cars.withColumn(
    'density',
    round(cars.weight_kg / cars.length_meters, 2),
)
cars = cars.withColumn(
    'density_area',
    round(cars.weight_kg / cars.length_meters**2, 2),
)
cars = cars.withColumn(
    'density_volume',
    round(cars.weight_kg / cars.length_meters**3, 2),
)

# One-hot encode the type index; `onehot` ends up bound to the fitted model
onehot = OneHotEncoderEstimator(inputCols=['type_idx'], outputCols=['type_dummy']).fit(cars)
cars = onehot.transform(cars)

# Widen the pandas display limits
pd.set_option('display.max_columns', None)  # all cols
pd.set_option('display.width', 161)
pd.set_option('display.max_colwidth', 199)
#print(cars.toPandas().sample(12))

# Print the (column, type) pairs, padded with blank lines
print('\n', cars.dtypes, '\n')

# Combine the numeric and encoded columns into a single 'features' vector
assembler = VectorAssembler(
    inputCols=[
        'weight_kg',
        'cyl',
        'type_dummy',
        'density',
        'density_area',
        'density_volume',
    ],
    outputCol='features',
)
cars = assembler.transform(cars)

# Keep only the 'consumption' and 'features' columns
kars = cars.select('consumption', 'features')
# Apply `logistic` to the testing data to obtain predictions
prediction = logistic.transform(sms_test)

# Confusion matrix: count how often each (label, prediction) pair occurs
(
    prediction
    .groupBy('label', 'prediction')
    .count()
    .show()
)

# One-hot encoding
from pyspark.ml.feature import OneHotEncoderEstimator

# The keyword is `outputCols` (plural, taking a list); this estimator has no
# `outputCol` parameter
onehot = OneHotEncoderEstimator(inputCols=['type_idx'], outputCols=['type_dummy'])
# Fit the encoder to the data; `onehot` is rebound to the fitted model
onehot = onehot.fit(cars)
# How many category levels?
print(onehot.categorySizes)

# Add the encoded column, then list each distinct type with its index and
# encoding, ordered by index
cars = onehot.transform(cars)
cars.select('type', 'type_idx', 'type_dummy').distinct().sort('type_idx').show()

# Dense versus sparse
# The vector classes live in the `pyspark` package; there is no top-level
# `spark` module to import from
from pyspark.mllib.linalg import DenseVector, SparseVector
# The same eight-element vector in both representations: 1 at position 0
# and 7 at position 5
DenseVector([1, 0, 0, 0, 0, 7, 0, 0])
SparseVector(8, [0, 5], [1, 7])

# Bring the one-hot encoder estimator into scope
from pyspark.ml.feature import OneHotEncoderEstimator

# Configure an encoder for the origin index and fit it to the flights data;
# `onehot` ends up bound to the fitted model
onehot = OneHotEncoderEstimator(inputCols=['org_idx'], outputCols=['org_dummy']).fit(flights)

# Learn the categories present in the data
# NOTE(review): `indexer` must be an unfitted indexer at this point —
# confirm where it is defined
indexer_model = indexer.fit(flights)

# Append the numeric index column
flights_indexed = indexer_model.transform(flights)

# Index the 'org' categorical feature the same way
flites = (
    StringIndexer(inputCol="org", outputCol='org_idx')
    .fit(flights_indexed)
    .transform(flights_indexed)
)

# Rebuild the encoder, fit it on the indexed data and add 'org_dummy'
onehot = OneHotEncoderEstimator(inputCols=["org_idx"], outputCols=["org_dummy"]).fit(flites)
flites = onehot.transform(flites)

# Widen the pandas display limits
pd.set_option('display.max_columns', None)  # all cols
pd.set_option('display.width', 199)
pd.set_option('display.max_colwidth', 199)

# Bucket boundaries every 3 hours from 0 through 24
buckets = Bucketizer(
    inputCol="depart",
    outputCol="depart_bucket",
    splits=list(range(0, 25, 3)),
)

# Assign each departure time to its bucket
bucketed = buckets.transform(flites)
#bucketed.select("depart", "depart_bucket").show(5)

# Encoder for the bucket index (configured only; it is not fitted here)
onehot = OneHotEncoderEstimator(
    inputCols=["depart_bucket"],
    outputCols=["depart_dummy"],
)
# Fit a string indexer on the 'type' column; `indexer` ends up bound to the
# fitted model
indexer = StringIndexer(inputCol='type', outputCol='type_idx').fit(cars)
# Append the 'type_idx' column
cars = indexer.transform(cars)

# Widen the pandas display limits
pd.set_option('display.max_columns', None)  # all cols
pd.set_option('display.width', 161)
#print(cars.toPandas().sample(12))

# Print the (column, type) pairs, padded with blank lines
print('\n', cars.dtypes, '\n')

# Narrow to the name, the type and its index
kars = cars.select('name', 'type', 'type_idx')

# Print a 12-row sample of the pandas copy
print(kars.toPandas().sample(12))

# One-hot encode the type index; `onehot` ends up bound to the fitted model
onehot = OneHotEncoderEstimator(inputCols=['type_idx'], outputCols=['type_dummy']).fit(kars)
kars = onehot.transform(kars)

# List each distinct type with its index and encoding, ordered by index
(
    kars
    .select('type', 'type_idx', 'type_dummy')
    .distinct()
    .sort('type_idx')
    .show()
)

# The same eight-element vector, dense and sparse
print("DenseVector:", DenseVector([1, 0, 0, 0, 0, 7, 0, 0]))
print("SparseVector:", SparseVector(8, {0: 1.0, 5: 7.0}))

# Shut the Spark session down
spark.stop()
示例#9
0
                         outputCol='idxPclass').fit(dftrain)
# Apply `sipclass` to the training data, then discard the raw 'Pclass' column
dftrain = sipclass.transform(dftrain).drop('Pclass')

# In[16]:

dftrain.show()

# In[17]:

from pyspark.ml.feature import OneHotEncoderEstimator

# Fit a one-hot encoder on the indexed class column; `ohe` is the fitted model
ohe = OneHotEncoderEstimator(
    inputCols=['idxPclass'],
    outputCols=['ohePclass'],
    dropLast=True,
    handleInvalid='keep',
).fit(dftrain)

# Add the encoded column, then drop the index it was built from
dftrain = ohe.transform(dftrain).drop('idxPclass')

# Preview at most 20 rows of a sample drawn without replacement at
# fraction 0.1
dftrain.sample(withReplacement=False, fraction=0.1).limit(20).show()

# In[18]:

from pyspark.ml.feature import VectorAssembler

# Pack the listed columns into a single 'features' vector column
va = VectorAssembler(
    inputCols=[
        'SibSp',
        'Parch',
        'Fare',
        'impAge',
        'ohePclass',
    ],
    outputCol='features',
)

# Add 'features', then drop the columns that were packed into it
dftrain = va.transform(dftrain).drop('SibSp', 'Parch', 'Fare', 'impAge', 'ohePclass')
dftrain.show()

# In[19]: