from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler, VectorAssembler


def transform_data_in_pipeline(df):
    """
    Assemble and scale the selected columns into feature vectors.

    :param df: input Spark DataFrame containing the 'x', 'y',
        'star_rating_number' and 'avg_adr' columns
    :return: the input DataFrame with the 'features' and 'scaledFeatures'
        vector columns appended
    """
    # Initialise pipeline variables
    stages = []
    assembler_inputs = []

    # Assemble a features vector from the Spark DataFrame fields
    assembler = VectorAssembler(
        inputCols=['x', 'y', 'star_rating_number', 'avg_adr'],
        outputCol='features')
    stages += [assembler]
    assembler_inputs += [assembler.getOutputCol()]

    # Scale the assembled features to unit standard deviation
    # (mean centring is off by default for StandardScaler)
    scaler = StandardScaler(inputCol=assembler.getOutputCol(),
                            outputCol='scaledFeatures')
    stages += [scaler]
    assembler_inputs += [scaler.getOutputCol()]

    # Execute the pipeline
    pipeline_model = Pipeline() \
        .setStages(stages) \
        .fit(df)

    # Return the DataFrame with the additional transformed feature vectors
    return pipeline_model.transform(df)
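# Hedged usage sketch (not part of the original source): one way the function above
# might be called. The SparkSession and the sample rows/values are assumptions; the
# function only requires the 'x', 'y', 'star_rating_number' and 'avg_adr' columns.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sample_df = spark.createDataFrame(
    [(1.0, 2.0, 4, 120.5), (0.5, 1.5, 3, 95.0), (2.2, 0.8, 5, 210.0)],
    ['x', 'y', 'star_rating_number', 'avg_adr'])
transformed_df = transform_data_in_pipeline(sample_df)
transformed_df.select('features', 'scaledFeatures').show(truncate=False)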
# Module-level imports needed by this method (KMeansPipeline is defined elsewhere
# in this module)
from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import StandardScaler, StringIndexer, VectorAssembler


def train(cls, spark, sdf, cat_colnames, num_colnames):
    # Index each categorical column into a numeric "<name>_index" column,
    # skipping rows with unseen/invalid values
    string_indexer_list = []
    for cat_colname in cat_colnames:
        string_indexer = StringIndexer(inputCol=cat_colname,
                                       outputCol=cat_colname + "_index",
                                       handleInvalid="skip")
        string_indexer_list.append(string_indexer)

    # If numeric columns are present, assemble them and z-score them
    out = []
    pipe = []
    if len(num_colnames) > 0:
        assembler = VectorAssembler(inputCols=num_colnames,
                                    outputCol="features_vec")
        standard_scaler = StandardScaler(inputCol="features_vec",
                                         outputCol="features_zs",
                                         withMean=True,
                                         withStd=True)
        out = [standard_scaler.getOutputCol()]
        pipe = [assembler, standard_scaler]

    # Combine the indexed categoricals and the scaled numerics into one vector
    assembler_2 = VectorAssembler(
        inputCols=[x.getOutputCol() for x in string_indexer_list] + out,
        outputCol="features")

    # Cluster the combined feature vector with k-means (k=4)
    estimator = KMeans(featuresCol="features", predictionCol="cluster_id", k=4)

    clustering_pipeline = Pipeline(
        stages=string_indexer_list + pipe + [assembler_2] + [estimator])
    clustering_pipeline_model = clustering_pipeline.fit(sdf)
    return KMeansPipeline(pipeline_model=clustering_pipeline_model)
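# Hedged usage sketch (an assumption, not part of the original source): calling the
# method above, assuming it is decorated as a @classmethod on KMeansPipeline. The
# DataFrame and its column names ('colour' categorical; 'height', 'weight' numeric)
# are made up for illustration.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame(
    [("red", 1.7, 65.0), ("blue", 1.8, 80.0), ("red", 1.6, 55.0),
     ("green", 1.9, 90.0)],
    ["colour", "height", "weight"])
kmeans_model = KMeansPipeline.train(spark, sdf,
                                    cat_colnames=["colour"],
                                    num_colnames=["height", "weight"])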
# Module-level imports needed by this method. ConvertAllToVecToMl is a
# project-specific custom transformer defined elsewhere in this codebase.
import pyspark.ml.classification as classification
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler, VectorAssembler


def create_standard_pipeline(self, cross_validate=False):
    """
    This method creates a standard pipeline, standard meaning: vectorize, standardize and model.

    :return: Pipeline for the pyspark model
    """
    # Feature columns are created from instance variables
    # feature_columns = [i.name for i in self._feature_cols]

    # Vectorize the feature columns
    vectorizer = VectorAssembler(inputCols=self._feature_cols,
                                 outputCol='v_features')

    # Cast the vector from mllib to ml
    converter = ConvertAllToVecToMl(inputCol=vectorizer.getOutputCol(),
                                    outputCol='casted')

    # Standardization estimator
    standardizes = StandardScaler(withMean=self._standardize,
                                  withStd=self._standardize,
                                  inputCol=converter.getOutputCol(),
                                  outputCol="scaled")

    # Keep only the non-tuple parameters; the feature and label columns are set explicitly
    dict_parameters = dict(
        filter(lambda x: not isinstance(x[1], tuple), self._params.items()))
    dict_parameters['featuresCol'] = standardizes.getOutputCol()
    dict_parameters['labelCol'] = self._label_col[0]  # HACK!!!
    # print(label_dict)

    # Resolve the classifier class by name and instantiate it with the parameters
    model = getattr(classification, self._algorithm)(**dict_parameters)

    pipe = Pipeline(stages=[vectorizer, converter, standardizes, model])
    return pipe
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StandardScaler, VectorAssembler


def Model(Data, Tgt='Target', Indp='Nada'):
    """Fit a random-forest pipeline on Data, using the Indp columns as features
    and Tgt as the label column."""
    # Assemble the selected feature columns into a single vector
    vector_assembler = VectorAssembler(
        inputCols=Indp,
        outputCol='assembled_important_features')

    # Standardize the assembled features
    standard_scaler = StandardScaler(inputCol=vector_assembler.getOutputCol(),
                                     outputCol='standardized_features')

    # Random-forest classifier trained on the scaled features
    rf = RandomForestClassifier(featuresCol=standard_scaler.getOutputCol(),
                                labelCol=Tgt)

    # letters_train, letters_test = letters.randomSplit([0.8, 0.2], seed=4)
    pipeline = Pipeline(stages=[vector_assembler, standard_scaler, rf])
    pipeline_model_rf = pipeline.fit(Data)
    return pipeline_model_rf
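# Hedged usage sketch (not part of the original source): one way Model might be
# invoked. The SparkSession, DataFrame, column names and label values are
# illustrative assumptions; Indp should be a list of numeric feature column names.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
data = spark.createDataFrame(
    [(1.0, 2.0, 0.0), (2.0, 3.5, 1.0), (0.5, 1.0, 0.0), (3.0, 4.0, 1.0)],
    ["f1", "f2", "Target"])
fitted_rf = Model(data, Tgt="Target", Indp=["f1", "f2"])
fitted_rf.transform(data).select("Target", "prediction").show()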
# Module-level imports required by this routine
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StandardScaler, StringIndexer, VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


def main(spark, data_file, model_file):
    '''Main routine for supervised training

    Parameters
    ----------
    spark : SparkSession object

    data_file : string, path to the parquet file to load

    model_file : string, path to store the serialized model file
    '''
    # Read data
    df = spark.read.parquet(data_file)

    # Take a 10% sample of the data without replacement
    df = df.sample(False, 0.1, seed=0)

    # Vectorize the selected MFCC features
    features = ['mfcc_' + '%.2d' % i for i in range(20)]
    assembler = VectorAssembler(inputCols=features,
                                outputCol="vectorized_features")

    # Standardize the features
    scaler = StandardScaler(inputCol="vectorized_features",
                            outputCol="scaled_features",
                            withStd=True, withMean=False)

    # Transform the string target variable into a numerical label
    indexer = StringIndexer(inputCol="genre", outputCol="label",
                            handleInvalid="skip")

    # Build the logistic regression
    lr = LogisticRegression(maxIter=20,
                            featuresCol=scaler.getOutputCol(),
                            labelCol=indexer.getOutputCol())

    # Build a pipeline
    pipeline = Pipeline(stages=[assembler, scaler, indexer, lr])

    # Build the parameter grid and the cross-validator
    paramGrid = ParamGridBuilder() \
        .addGrid(lr.elasticNetParam, [0.1, 0.3, 0.5, 0.8]) \
        .addGrid(lr.regParam, [0.1, 0.08, 0.05, 0.02, 0.01]) \
        .build()
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=MulticlassClassificationEvaluator(),
                              numFolds=5)

    # Fit the cross-validated pipeline and save the best model
    cvModel = crossval.fit(df)
    cvModel.bestModel.write().overwrite().save(model_file)
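# Hedged follow-up sketch (not part of the original routine): the saved best model
# is a fitted Pipeline, so it can later be reloaded with PipelineModel.load and
# applied to new data. The helper name below is a hypothetical addition.
from pyspark.ml import PipelineModel


def score_new_data(spark, model_file, data_file):
    """Reload the persisted pipeline and score a new parquet dataset."""
    model = PipelineModel.load(model_file)
    new_df = spark.read.parquet(data_file)
    return model.transform(new_df)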
        tgt_other_agent_indexer.getOutputCol()
    ],
    outputCols=[
        "country_code_ohe", "geoname_ohe", "source_ohe", "src_actor_ohe",
        "src_agent_ohe", "src_other_agent_ohe", "target_ohe", "tgt_actor_ohe",
        "tgt_agent_ohe", "tgt_other_agent_ohe"
    ],
    handleInvalid='keep',
    dropLast=True)

# Combine all features into a single column
feature_assembler = VectorAssembler(
    inputCols=ohe.getOutputCols() + [goldstein_scaler.getOutputCol()],
    outputCol="features")

# Index root_code labels
label_indexer = StringIndexer(
    inputCol="root_code", outputCol="indexedLabel").setHandleInvalid('skip')

# Select a subset of important features
feature_selector = ChiSqSelector(
    percentile=0.5,
    featuresCol=feature_assembler.getOutputCol(),
    labelCol=label_indexer.getOutputCol(),
    outputCol="selected_features")

# Train a RandomForest model
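# Hedged sketch (not from the original source): one plausible way to complete the
# "Train a RandomForest model" step above, wiring the selector output and indexed
# label into a RandomForestClassifier. numTrees=100 is an illustrative assumption.
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(featuresCol=feature_selector.getOutputCol(),
                            labelCol=label_indexer.getOutputCol(),
                            numTrees=100)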
# |-- genre: string (nullable = true)
# |-- label: integer (nullable = false)

# Preparing Data for Machine Learning
from pyspark.ml.feature import VectorAssembler, PCA, StringIndexer, StandardScaler
from pyspark.ml import Pipeline

numeric_features = [
    t[0] for t in binary_audio_feature_genre.dtypes if t[1] == 'double'
]
assembler = VectorAssembler(inputCols=numeric_features,
                            outputCol="VEC-FEATURES")
standard_scaler = StandardScaler(inputCol=assembler.getOutputCol(),
                                 outputCol="SCALED_FEATURES")
pca = PCA(k=5, inputCol=standard_scaler.getOutputCol(), outputCol="features")

# Use a Pipeline to chain multiple Transformers and Estimators together to
# specify the machine learning workflow
pipeline = Pipeline(stages=[assembler, standard_scaler, pca])
pipelineModel = pipeline.fit(training)
training = pipelineModel.transform(training).select('genre', 'features', 'label')
test = pipelineModel.transform(test).select('genre', 'features', 'label')

# Check the training and test data after the transformers have been applied
training.show()
test.show()

# Train the Logistic Regression
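# Hedged sketch (not part of the original source): one plausible way to complete
# the "Train the Logistic Regression" step on the PCA-derived 'features' column.
# maxIter=20 and the regularisation defaults are illustrative assumptions.
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol='features', labelCol='label', maxIter=20)
lr_model = lr.fit(training)
predictions = lr_model.transform(test)
predictions.select('genre', 'label', 'prediction').show()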
import sys
import time

from pyspark.ml import Pipeline
from pyspark.ml.feature import PCA, StandardScaler, VectorAssembler
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("PCAExample")\
        .getOrCreate()

    # Load the CSV file passed as the first command-line argument
    dataset = spark.read.format("csv").options(header='false',
                                               inferSchema='true',
                                               delimiter=',').load(sys.argv[1])

    t0 = time.time()

    # Assemble the selected input columns into a single feature vector
    assembler = VectorAssembler(inputCols=[
        '_c0', '_c1', '_c2', '_c3', '_c4', '_c5', '_c6', '_c7', '_c8', '_c9',
        '_c10', '_c11', '_c12', '_c13', '_c14', '_c15', '_c16', '_c17', '_c18',
        '_c20', '_c21'
    ], outputCol="features")
    df = assembler.transform(dataset)

    # Mean-centre the features (no rescaling to unit variance) before PCA
    scaler = StandardScaler(inputCol="features",
                            outputCol="scaledFeatures",
                            withStd=False,
                            withMean=True)

    # Project onto the first principal component
    pca = PCA(k=1, inputCol=scaler.getOutputCol(), outputCol="pcaFeatures")

    pipeline = Pipeline(stages=[scaler, pca])
    model = pipeline.fit(df)
    result = model.transform(df).select("pcaFeatures")
    print(time.time() - t0)

    spark.stop()
import mlflow
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.sql import SparkSession
from sklearn.datasets import load_iris

spark = SparkSession.builder.getOrCreate()
mlflow.pyspark.ml.autolog()

# Load the iris data as a pandas frame, rename the target column to 'label',
# and convert it to a Spark DataFrame
df = load_iris(as_frame=True).frame.rename(columns={"target": "label"})
df = spark.createDataFrame(df)
train, test = df.randomSplit([0.8, 0.2])

assembler = VectorAssembler(inputCols=df.columns[:-1], outputCol="features")
scaler = StandardScaler(inputCol=assembler.getOutputCol(),
                        outputCol="scaledFeatures")
lor = LogisticRegression(maxIter=5, featuresCol=scaler.getOutputCol())

# Non-nested pipeline
pipeline = Pipeline(stages=[assembler, scaler, lor])
with mlflow.start_run():
    pipeline_model = pipeline.fit(train)
    columns = ["features", "prediction"]
    pipeline_model.transform(test).select(columns).show()

# Nested pipeline
nested_pipeline = Pipeline(stages=[Pipeline(stages=[assembler, scaler]), lor])
with mlflow.start_run():
    nested_pipeline_model = nested_pipeline.fit(train)
    nested_pipeline_model.transform(test).select(columns).show()