Example #1
0
def build_model(numerical_columns: List[str], categorical_columns: List[str],
                label_col: str, max_iter: int) -> Pipeline:
    """Build a Spark ML pipeline: string-index and one-hot encode the
    categorical columns, assemble all features into a single vector, and
    finish with a gradient-boosted-tree classifier.

    :param numerical_columns: columns passed through as-is into the feature vector
    :param categorical_columns: columns to string-index and one-hot encode
    :param label_col: name of the label column the classifier trains on
    :param max_iter: maximum number of GBT boosting iterations
    :return: an unfitted Pipeline with all stages wired together
    """
    # One indexer per categorical column, then one encoder per indexed column.
    indexing_stages = [build_string_indexer(c) for c in categorical_columns]
    indexed_columns = [s.getOutputCol() for s in indexing_stages]
    encoding_stages = [build_one_hot_encoder(c) for c in indexed_columns]

    vector_assembler = VectorAssembler() \
        .setInputCols(numerical_columns + [s.getOutputCol() for s in encoding_stages]) \
        .setOutputCol('features')

    # BUG FIX: max_iter was accepted by this function but never applied,
    # so the classifier silently ran with its default iteration count.
    gbt = GBTClassifier()\
        .setFeaturesCol(vector_assembler.getOutputCol())\
        .setLabelCol(label_col)\
        .setMaxIter(max_iter)

    return Pipeline()\
        .setStages(indexing_stages + encoding_stages + [vector_assembler, gbt])
Example #2
0
        onehotenc = OneHotEncoder(inputCol=c, outputCol=c+"-onehot", dropLast=False)
        newdf = onehotenc.transform(newdf).drop(c)
        newdf = newdf.withColumnRenamed(c+"-onehot", c)
    return newdf

dfhot = oneHotEncodeColumns(dfnumeric, ["Take-out","GoodFor_lunch", "GoodFor_dinner", "GoodFor_breakfast"])

dfhot.show(5)

# Training set: assemble the clustering feature vector.
# FIX: the original built inputCols from an unordered set union
# (list(set(dfhot.columns) | set([...]))), which made the feature-vector
# column order nondeterministic across runs. Preserve dfhot.columns order
# and append any missing extras so the layout is reproducible.
extra_cols = [c for c in ['stars', 'review_count'] if c not in dfhot.columns]
assembler = VectorAssembler(inputCols=dfhot.columns + extra_cols, outputCol="features")
train = assembler.transform(dfhot)

# K-means with 5 clusters; fixed seed for reproducibility.
knum = 5
kmeans = KMeans(featuresCol=assembler.getOutputCol(), predictionCol="cluster", k=knum, seed=0)
model = kmeans.fit(train)
# FIX: was a Python 2 print statement (SyntaxError under Python 3);
# the rest of this snippet already uses the print() function.
print("Model Created!")

# See cluster centers:
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

# Apply the clustering model to our data:
prediction = model.transform(train)
prediction.groupBy("cluster").count().orderBy("cluster").show()

# Look at the features of each cluster
customerCluster = {}
print("Creating Splits")
# 70/30 random train/test split of the source DataFrame.
train, test = df.randomSplit([0.7, 0.3])

print("Selected Features Count: {0}".format(len(feature_cols)))
print("Selected Features: {0}".format(feature_cols))

print("Building Pipeline")
# Hash all categorical columns into one sparse "categorical_features" vector.
categorical_hasher = FeatureHasher(inputCols=categorical_cols,
                                   outputCol="categorical_features",
                                   categoricalCols=categorical_cols)
# Pack the continuous columns into a single vector so they can be scaled together.
continuous_vector = VectorAssembler(inputCols=continuous_cols,
                                    outputCol="continuous_vector")
# Rescale every continuous feature into the [0, 1] range.
scaler = MinMaxScaler(min=0.0,
                      max=1.0,
                      inputCol=continuous_vector.getOutputCol(),
                      outputCol="continuous_features")
# Final assembly of the model input vector.
# NOTE(review): assumes feature_cols names the intermediate output columns
# (e.g. "categorical_features", "continuous_features") — confirm upstream.
features = VectorAssembler(inputCols=feature_cols, outputCol="features")
# Plain logistic regression: regParam=0.0 means no regularization, so
# elasticNetParam has no effect; threshold=0.5 is the standard cutoff.
regression = LogisticRegression(featuresCol=features.getOutputCol(),
                                labelCol="HasDetections",
                                regParam=0.0,
                                elasticNetParam=0.0,
                                tol=1e-06,
                                threshold=0.5,
                                family="auto")
# Stages execute in order: hash -> assemble -> scale -> assemble -> fit.
pipeline = Pipeline(stages=[
    categorical_hasher, continuous_vector, scaler, features, regression
])
# Evaluate predictions as plain accuracy against the "HasDetections" label.
evaluator = MulticlassClassificationEvaluator(labelCol="HasDetections",
                                              predictionCol="prediction",
                                              metricName="accuracy")