def build_model(numerical_columns: List[str],
                categorical_columns: List[str],
                label_col: str,
                max_iter: int) -> Pipeline:
    """Build a Spark ML training pipeline for a gradient-boosted-tree classifier.

    Each categorical column is string-indexed then one-hot encoded (via the
    file's `build_string_indexer` / `build_one_hot_encoder` helpers); the
    encoded vectors plus the raw numerical columns are assembled into a single
    'features' vector feeding a GBTClassifier.

    Args:
        numerical_columns: columns passed through to the assembler as-is.
        categorical_columns: columns to index + one-hot encode.
        label_col: name of the label column for the classifier.
        max_iter: maximum boosting iterations for the GBT classifier.

    Returns:
        An unfitted Pipeline: [indexers..., encoders..., assembler, gbt].
    """
    indexing_stages = [build_string_indexer(c) for c in categorical_columns]
    indexed_columns = [s.getOutputCol() for s in indexing_stages]
    encoding_stages = [build_one_hot_encoder(c) for c in indexed_columns]
    vector_assembler = VectorAssembler() \
        .setInputCols(numerical_columns + [s.getOutputCol() for s in encoding_stages]) \
        .setOutputCol('features')
    gbt = GBTClassifier() \
        .setFeaturesCol(vector_assembler.getOutputCol()) \
        .setLabelCol(label_col) \
        .setMaxIter(max_iter)  # BUG FIX: max_iter was accepted but never applied
    return Pipeline() \
        .setStages(indexing_stages + encoding_stages + [vector_assembler, gbt])
onehotenc = OneHotEncoder(inputCol=c, outputCol=c+"-onehot", dropLast=False) newdf = onehotenc.transform(newdf).drop(c) newdf = newdf.withColumnRenamed(c+"-onehot", c) return newdf dfhot = oneHotEncodeColumns(dfnumeric, ["Take-out","GoodFor_lunch", "GoodFor_dinner", "GoodFor_breakfast"]) dfhot.show(5) # Taining set assembler = VectorAssembler(inputCols = list(set(dfhot.columns) | set(['stars','review_count'])), outputCol="features") train = assembler.transform(dfhot) # Kmeans set for 5 clusters knum = 5 kmeans = KMeans(featuresCol=assembler.getOutputCol(), predictionCol="cluster", k=knum, seed=0) model = kmeans.fit(train) print "Model Created!" # See cluster centers: centers = model.clusterCenters() print("Cluster Centers: ") for center in centers: print(center) # Apply the clustering model to our data: prediction = model.transform(train) prediction.groupBy("cluster").count().orderBy("cluster").show() # Look at the features of each cluster customerCluster = {}
# Split the source frame into train/test partitions (70/30, random seed
# left to Spark) and report the feature selection before building the model.
print("Creating Splits")
train, test = df.randomSplit([0.7, 0.3])

print("Selected Features Count: {0}".format(len(feature_cols)))
print("Selected Features: {0}".format(feature_cols))

print("Building Pipeline")

# Categorical side: hash all categorical columns into one sparse vector.
categorical_hasher = FeatureHasher(inputCols=categorical_cols,
                                   outputCol="categorical_features",
                                   categoricalCols=categorical_cols)

# Continuous side: assemble into a vector, then scale every feature to [0, 1].
continuous_vector = VectorAssembler(inputCols=continuous_cols,
                                    outputCol="continuous_vector")
scaler = MinMaxScaler(min=0.0, max=1.0,
                      inputCol=continuous_vector.getOutputCol(),
                      outputCol="continuous_features")

# Final feature vector and the classifier itself.
features = VectorAssembler(inputCols=feature_cols, outputCol="features")
regression = LogisticRegression(featuresCol=features.getOutputCol(),
                                labelCol="HasDetections",
                                regParam=0.0, elasticNetParam=0.0,
                                tol=1e-06, threshold=0.5, family="auto")

# Stages run in order: hash categoricals, vectorize + scale continuous,
# assemble, then fit the logistic regression.
stages = [categorical_hasher, continuous_vector, scaler, features, regression]
pipeline = Pipeline(stages=stages)

evaluator = MulticlassClassificationEvaluator(labelCol="HasDetections",
                                              predictionCol="prediction",
                                              metricName="accuracy")