from pyspark.ml import Pipeline
from pyspark.ml import feature as fea
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator


def buildModel(self, save_pipe_path=None):
    df = self.getModelData()

    # Feature stages: index the label, split the comma-separated app names,
    # count-vectorize the tokens, assemble and standardize the features.
    label_index = fea.StringIndexer(inputCol='user_type', outputCol='label')
    reTokenizer = fea.RegexTokenizer(inputCol='appnames', outputCol='appname_token', pattern=',')
    cnt_vector = fea.CountVectorizer(inputCol='appname_token', outputCol='appname_vector')
    vecAssembler = fea.VectorAssembler(inputCols=['appname_vector'], outputCol='feature')
    scaler = fea.StandardScaler(inputCol='feature', outputCol='features')

    if not save_pipe_path:
        # Tune elasticNetParam by cross-validation, scored by area under the PR curve.
        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.elasticNetParam, [0.0, 1.0]).build()
        evaluator = BinaryClassificationEvaluator(metricName='areaUnderPR')

        pipeline = Pipeline(stages=[label_index, reTokenizer, cnt_vector, vecAssembler, scaler])
        pipe = pipeline.fit(df)
        pipe_out = pipe.transform(df)

        cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
        model = cv.fit(pipe_out)

        print(evaluator.evaluate(model.transform(pipe_out)))
        print('Best Param (elasticNetParam):', model.bestModel._java_obj.getElasticNetParam())

        predict_result = model.transform(pipe_out).select('probability', 'label').toPandas()
        predict_result.to_csv('/home/chenchen/data/predict_result1.csv', index=False)
    else:
        # Refit the whole pipeline with the chosen hyperparameter and persist it.
        lr = LogisticRegression(elasticNetParam=1.0)
        pipeline = Pipeline(stages=[label_index, reTokenizer, cnt_vector, vecAssembler, scaler, lr])
        model = pipeline.fit(df)
        model.save(save_pipe_path)
        print('pipe saved')
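# A hedged usage sketch for buildModel: `obj`, `new_df`, and the path are
# illustrative, not from the source. Note the saved pipeline still contains the
# StringIndexer, so data being scored must carry 'user_type' as well as 'appnames'.
from pyspark.ml import PipelineModel

# obj.buildModel()                       # cross-validate and print PR-AUC
# obj.buildModel('/tmp/user_type_pipe')  # fit on all data and save the pipeline

pipe = PipelineModel.load('/tmp/user_type_pipe')
scored = pipe.transform(new_df)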
# Assumes the pyspark.ml.feature module is imported under the alias
# `normalizer`, e.g. `from pyspark.ml import feature as normalizer`.
def std_scaler(X, inputCol="features", outputCol="resFeatures"):
    scaler = normalizer.StandardScaler(inputCol=inputCol, outputCol=outputCol,
                                       withStd=True, withMean=False)
    scalerModel = scaler.fit(X)            # learn per-feature standard deviations
    scaledData = scalerModel.transform(X)  # append the scaled vector column
    return scaledData
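# A minimal usage sketch for std_scaler, assuming an active SparkSession; the
# toy data and column names are illustrative.
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [(Vectors.dense([1.0, 10.0]),), (Vectors.dense([2.0, 20.0]),)],
    ['features'])

scaled = std_scaler(df)  # appends a 'resFeatures' column with unit-variance vectors
scaled.show(truncate=False)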
def _scale_data_frame(df, vector=None):
    if vector:
        # Convert the (possibly sparse) vector column to dense vectors first.
        df = df.withColumn(vector, udf(_to_dense, ml_linalg.VectorUDT())(vector))
    scale = feature.StandardScaler(
        withMean=True, withStd=True,
        inputCol=vector, outputCol='std_vector')
    model = scale.fit(df)
    # Replace the original vector column with its standardized version.
    return (model
            .transform(df)
            .select([i for i in df.columns if i != vector] + [scale.getOutputCol()])
            .withColumnRenamed(existing=scale.getOutputCol(), new=vector))
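# The snippet above relies on a `_to_dense` helper that is not shown; a minimal
# sketch under that assumption, matching the `ml_linalg` alias used above.
from pyspark.ml import linalg as ml_linalg

def _to_dense(vector):
    # SparseVector and DenseVector both expose toArray(); always return dense.
    return ml_linalg.DenseVector(vector.toArray())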
def _vector_scale(self, df):
    # Pack the listed feature columns into a single dense vector column.
    to_dense_udf = F.udf(self._to_dense, linalg.VectorUDT())
    feature_str = 'features'
    vector_df = df.withColumn(feature_str, to_dense_udf(*self._list_feature))

    if self._bool_standardize:
        # Standardize: subtract the per-feature mean and divide by the std-dev.
        scaling_model = features.StandardScaler(
            inputCol=feature_str, outputCol='scaled_features',
            withMean=True, withStd=True).fit(vector_df)
    else:
        # withMean=False and withStd=False leave the vectors unchanged, so the
        # 'scaled_features' column exists on both branches.
        scaling_model = features.StandardScaler(
            inputCol=feature_str, outputCol='scaled_features',
            withMean=False, withStd=False).fit(vector_df)

    scaled_df = scaling_model.transform(vector_df)
    return scaled_df
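# A hedged sketch of the `self._to_dense` helper this method assumes: unlike
# the single-vector converter above, it packs scalar columns into one vector.
# Shown at module level here; on the class it would be a @staticmethod.
from pyspark.ml import linalg

def _to_dense(*cols):
    # One scalar value per feature column for the current row.
    return linalg.Vectors.dense(list(cols))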
        'EUMEAT', 'EUPRPMEL', 'TUACTIVITY_N',
        'tuactdur24', 'tewhere', 'TESEX'
    ],
    outputCol='features')

# COMMAND ----------

vecIntercept = feature.VectorAssembler(inputCols=[], outputCol='emptyFeatures')

# COMMAND ----------

# MAGIC %md
# MAGIC Scaling stage to scale the features from the VectorAssembler

# COMMAND ----------

scaled = feature.StandardScaler(inputCol='features', outputCol='scaledFeatures')

# COMMAND ----------

# MAGIC %md
# MAGIC Three linear regression pipeline stages:
# MAGIC 1. LR with just the intercept
# MAGIC 2. LR with all features, unscaled
# MAGIC 3. LR with all features and the scaling stage

# COMMAND ----------

regIntercept = regression.LinearRegression(labelCol='ERBMI', featuresCol='emptyFeatures')

# COMMAND ----------
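# The markdown cell lists three pipelines, but only the intercept-only stage
# survives in this fragment. A hedged sketch of the other two estimators; the
# names `regFeatures` and `regScaled` are assumptions, not from the notebook.
from pyspark.ml import regression

regFeatures = regression.LinearRegression(labelCol='ERBMI', featuresCol='features')
regScaled = regression.LinearRegression(labelCol='ERBMI', featuresCol='scaledFeatures')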
X_Opt = X[:, [1, 4, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_Opt).fit()
regressor_OLS.summary()

# choose the feature columns
feature_cols = [df_train.columns[1], df_train.columns[2],
                df_train.columns[3], df_train.columns[4]]
# feature_cols = df.columns[1:]
assembler = feature.VectorAssembler(inputCols=feature_cols, outputCol='features')
df_train = assembler.setHandleInvalid("skip").transform(df_train)
df_train = df_train.withColumnRenamed('Survived', 'label')
df_train = df_train.select('features', 'label')

# scaling: fit on the training data, then swap the scaled column in for 'features'
scaler = feature.StandardScaler(inputCol="features", outputCol="scaledFeatures",
                                withStd=True, withMean=False)
scaler = scaler.fit(df_train)
df_train = scaler.transform(df_train)
df_train = df_train.drop('features').withColumnRenamed('scaledFeatures', 'features')

# TEST
# read the test data
df_test = spark.read.csv("test.csv", header=True, inferSchema=True)
df_test = df_test.drop('Name', 'Ticket', 'Cabin')

# convert categorical columns to numeric
chn = []
for col in df_test.columns:
    count = df_test.filter('{} is null'.format(col)).count()
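# Once df_test matches the training schema, the scaler fitted above (now a
# StandardScalerModel, since the snippet rebinds `scaler` to the fit result)
# should be reused rather than refit, to keep test-set statistics out of the
# preprocessing. A hedged sketch:
df_test = assembler.setHandleInvalid("skip").transform(df_test)
df_test = scaler.transform(df_test)
df_test = df_test.drop('features').withColumnRenamed('scaledFeatures', 'features')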
feature_cols = [
    x for x in encoded.columns if x not in {'userId', 'userChurnFlag'}
]
assembler = smf.VectorAssembler(inputCols=feature_cols, outputCol='features')
encoded = assembler.transform(encoded)
encoded = encoded.drop(*feature_cols)
encoded = encoded.withColumnRenamed('userChurnFlag', 'label')
encoded = encoded.persist()
encoded_sample = encoded.limit(1000).toPandas()

# Split out a validation dataset (3:1 train/validation)
train, val = encoded.randomSplit([3.0, 1.0], seed=42)

# Set up the pipeline for model training/evaluation
scaler = smf.StandardScaler(withStd=True, withMean=False,
                            inputCol='features', outputCol='scaledFeatures')

# Use PCA to reduce the dimensionality of the scaled vectors
reducer = smf.PCA(k=10, inputCol=scaler.getOutputCol(), outputCol='selectedFeatures')

# Use a classifier to generate the final predictions
classifier = smc.GBTClassifier(labelCol='label',
                               featuresCol=reducer.getOutputCol(),
                               predictionCol='predictedLabel')

# Combine all steps in a pipeline
pipeline = sm.Pipeline(stages=[scaler, reducer, classifier])
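# A hedged fit/evaluate sketch for this pipeline; the metric choice is an
# assumption (BinaryClassificationEvaluator defaults to areaUnderROC and reads
# the GBTClassifier's default 'rawPrediction' column).
from pyspark.ml.evaluation import BinaryClassificationEvaluator

model = pipeline.fit(train)
predictions = model.transform(val)

evaluator = BinaryClassificationEvaluator(labelCol='label')
print(evaluator.evaluate(predictions))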
indexers = [
    ft.StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c))
    for c in nomial_features
]
encoders = [
    ft.OneHotEncoder(inputCol=indexer.getOutputCol(),
                     outputCol="{0}_encoded".format(indexer.getOutputCol()))
    for indexer in indexers
]
assembler_onehot = ft.VectorAssembler(
    inputCols=[encoder.getOutputCol() for encoder in encoders],
    outputCol="onehot_features")

# scale the numeric columns
assembler_numeric = ft.VectorAssembler(inputCols=numeric_features,
                                       outputCol="numeric_features")
std_scaler = ft.StandardScaler(inputCol="numeric_features",
                               outputCol="numeric_features_scaled")

assembler_final = ft.VectorAssembler(
    inputCols=['onehot_features', 'numeric_features_scaled'],
    outputCol="final_features")
pca_model = ft.PCA(k=6, inputCol="final_features", outputCol="pca_features")

pipeline = Pipeline(stages=indexers + encoders + [
    assembler_onehot, assembler_numeric, std_scaler, assembler_final, pca_model
])
preprocess_model = pipeline.fit(df)
scaledData = preprocess_model.transform(df)

# Save and load the fitted model
from pyspark.ml import PipelineModel
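# The save/load step itself is cut off after the import; a minimal sketch, with
# an illustrative path.
preprocess_model.write().overwrite().save('preprocess_pipeline_model')
loaded_model = PipelineModel.load('preprocess_pipeline_model')
scaled_again = loaded_model.transform(df)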